• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

HicServices / RDMP / 6245535001

20 Sep 2023 07:44AM UTC coverage: 57.013%. First build
6245535001

push

github

web-flow
8.1.0 Release (#1628)

* Bump Newtonsoft.Json from 13.0.1 to 13.0.2

Bumps [Newtonsoft.Json](https://github.com/JamesNK/Newtonsoft.Json) from 13.0.1 to 13.0.2.
- [Release notes](https://github.com/JamesNK/Newtonsoft.Json/releases)
- [Commits](https://github.com/JamesNK/Newtonsoft.Json/compare/13.0.1...13.0.2)

---
updated-dependencies:
- dependency-name: Newtonsoft.Json
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump NLog from 5.0.5 to 5.1.0

Bumps [NLog](https://github.com/NLog/NLog) from 5.0.5 to 5.1.0.
- [Release notes](https://github.com/NLog/NLog/releases)
- [Changelog](https://github.com/NLog/NLog/blob/dev/CHANGELOG.md)
- [Commits](https://github.com/NLog/NLog/compare/v5.0.5...v5.1.0)

---
updated-dependencies:
- dependency-name: NLog
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump NLog from 5.0.5 to 5.1.0

* Fix -r flag - should have been --results-directory all along

* Bump Newtonsoft.Json from 13.0.1 to 13.0.2

* Bump YamlDotNet from 12.0.2 to 12.1.0

Bumps [YamlDotNet](https://github.com/aaubry/YamlDotNet) from 12.0.2 to 12.1.0.
- [Release notes](https://github.com/aaubry/YamlDotNet/releases)
- [Commits](https://github.com/aaubry/YamlDotNet/compare/v12.0.2...v12.1.0)

---
updated-dependencies:
- dependency-name: YamlDotNet
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump Moq from 4.18.2 to 4.18.3

Bumps [Moq](https://github.com/moq/moq4) from 4.18.2 to 4.18.3.
- [Release notes](https://github.com/moq/moq4/releases)
- [Changelog](https://github.com/moq/moq4/blob/main/CHANGELOG.md)
- [Commits](https://github.com/moq/moq4/compare/v4.18.2...v4.18.3)

---
updated-dependencies:
- dependency-name: Moq
... (continued)

10732 of 20257 branches covered (0.0%)

Branch coverage included in aggregate %.

48141 of 48141 new or added lines in 1086 files covered. (100.0%)

30685 of 52388 relevant lines covered (58.57%)

7387.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

75.9
/Rdmp.Core/DataLoad/Modules/DataFlowSources/SubComponents/FlatFileColumnCollection.cs
1
// Copyright (c) The University of Dundee 2018-2019
2
// This file is part of the Research Data Management Platform (RDMP).
3
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
4
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
5
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.
6

7
using System;
8
using System.Collections.Generic;
9
using System.Collections.ObjectModel;
10
using System.Data;
11
using System.Linq;
12
using System.Text;
13
using CsvHelper;
14
using FAnsi.Discovery;
15
using FAnsi.Extensions;
16
using Rdmp.Core.DataFlowPipeline.Requirements;
17
using Rdmp.Core.DataLoad.Modules.Exceptions;
18
using Rdmp.Core.ReusableLibraryCode;
19
using Rdmp.Core.ReusableLibraryCode.Progress;
20

21
namespace Rdmp.Core.DataLoad.Modules.DataFlowSources.SubComponents;
22

23
/// <summary>
24
/// This class is a sub component of <see cref="DelimitedFlatFileDataFlowSource"/>, it is responsible for processing the headers (or overriding headers)
25
/// of a CSV (TSV etc) file.
26
/// 
27
/// <para>The component has two main operational modes after it has read headers: <see cref="GetDataTableWithHeaders"/> and  <see cref="MakeDataTableFitHeaders"/></para>
28
/// </summary>
29
public class FlatFileColumnCollection
30
{
31
    /// <summary>
32
    /// Text to display in ASCII art of column matches when a column from the source could not be matched
33
    /// with the destination.
34
    /// </summary>
35
    public const string UnmatchedText = "????";
36

37
    private readonly FlatFileToLoad _toLoad;
38
    private readonly bool _makeHeaderNamesSane;
39
    private readonly ExplicitTypingCollection _explicitlyTypedColumns;
40
    private readonly string _forceHeaders;
41
    private readonly bool _forceHeadersReplacesFirstLineInFile;
42
    private readonly string _ignoreColumns;
43

44
    /// <summary>
45
    /// The columns from the file the user does not want to load into the destination (this will not help
46
    /// you avoid bad data).
47
    /// </summary>
48
    public HashSet<string> IgnoreColumnsList { get; private set; }
48,534✔
49

50
    public FlatFileColumnCollection(FlatFileToLoad toLoad, bool makeHeaderNamesSane,
244✔
51
        ExplicitTypingCollection explicitlyTypedColumns, string forceHeaders, bool forceHeadersReplacesFirstLineInFile,
244✔
52
        string ignoreColumns)
244✔
53
    {
54
        _toLoad = toLoad;
244✔
55
        _makeHeaderNamesSane = makeHeaderNamesSane;
244✔
56
        _explicitlyTypedColumns = explicitlyTypedColumns;
244✔
57
        _forceHeaders = forceHeaders;
244✔
58
        _forceHeadersReplacesFirstLineInFile = forceHeadersReplacesFirstLineInFile;
244✔
59
        _ignoreColumns = ignoreColumns;
244✔
60
    }
244✔
61

62
    public string this[int index] => _headers[index];
159,176✔
63

64
    private enum State
65
    {
66
        Start,
67
        AfterHeadersRead,
68
        AfterTableGenerated
69
    }
70

71
    private State _state = State.Start;
72

73
    /// <summary>
74
    /// The Headers found in the file / overridden by ForceHeaders
75
    /// </summary>
76
    private string[] _headers;
77

78
    /// <summary>
79
    /// Column headers that appear in the middle of the file (i.e. not trailing) but that don't have a header name.  These get thrown away
80
    /// and they must never have data in them.  This lets you have a full blank column in the middle of your file e.g. if you have inserted
81
    /// it via Excel
82
    /// </summary>
83
    public ReadOnlyCollection<DataColumn> UnamedColumns = new(Array.Empty<DataColumn>()); //start off with none
244✔
84

85
    public bool FileIsEmpty;
86

87
    /// <summary>
88
    /// used to advise user if he has selected the wrong separator
89
    /// </summary>
90
    private string[] _commonSeparators = { "|", ",", "    ", "#" };
244✔
91

92
    /// <summary>
93
    /// Counts the number of headers that are not null
94
    /// </summary>
95
    public int CountNotNull
96
    {
97
        get { return _headers.Except(IgnoreColumnsList).Count(h => !h.IsBasicallyNull()); }
46,924✔
98
    }
99

100
    /// <summary>
101
    /// The number of headers including null ones (but not trailing null headers)
102
    /// </summary>
103
    public int Length => _headers.Length;
44,832✔
104

105

106
    public void GetHeadersFromFile(CsvReader r)
107
    {
108
        //check state
109
        if (_state != State.Start)
242!
110
            throw new Exception($"Illegal state, headers cannot be read at state {_state}");
×
111

112
        _state = State.AfterHeadersRead;
242✔
113

114

115
        //if we are not forcing headers we must get them from the file
116
        if (string.IsNullOrWhiteSpace(_forceHeaders))
242✔
117
        {
118
            //read the first record from the file (this will read the header and first row
119
            var empty = !r.Read();
216✔
120

121
            if (empty)
216✔
122
            {
123
                FileIsEmpty = true;
4✔
124
                return;
4✔
125
            }
126

127
            //get headers from first line of the file
128
            r.ReadHeader();
212✔
129
            _headers = r.HeaderRecord;
212✔
130
        }
131
        else
132
        {
133
            //user has some specific headers he wants to override with
134
            _headers = _forceHeaders.Split(new[] { r.Configuration.Delimiter }, StringSplitOptions.None);
26✔
135
        }
136

137
        //ignore these columns (trimmed and ignoring case)
138
        if (!string.IsNullOrWhiteSpace(_ignoreColumns))
238✔
139
            IgnoreColumnsList = new HashSet<string>(
12✔
140
                _ignoreColumns.Split(new[] { r.Configuration.Delimiter }, StringSplitOptions.None)
12✔
141
                    .Select(h => h.Trim())
16✔
142
                , StringComparer.CurrentCultureIgnoreCase);
12✔
143
        else
144
            IgnoreColumnsList = new HashSet<string>();
226✔
145

146
        //Make adjustments to the headers (trim etc)
147

148
        //trim them
149
        for (var i = 0; i < _headers.Length; i++)
2,104✔
150
            if (!string.IsNullOrWhiteSpace(_headers[i]))
814✔
151
                _headers[i] = _headers[i].Trim();
794✔
152

153
        //throw away trailing null headers e.g. the header line "Name,Date,,,"
154
        var trailingNullHeaders = _headers.Reverse().TakeWhile(s => s.IsBasicallyNull()).Count();
488✔
155

156
        if (trailingNullHeaders > 0)
238✔
157
            _headers = _headers.Take(_headers.Length - trailingNullHeaders).ToArray();
10✔
158

159
        //and maybe also help them out with a bit of sanity fixing
160
        if (_makeHeaderNamesSane)
238✔
161
            for (var i = 0; i < _headers.Length; i++)
24✔
162
                _headers[i] = QuerySyntaxHelper.MakeHeaderNameSensible(_headers[i]);
10✔
163
    }
238✔
164

165

166
    /// <summary>
167
    /// Creates a new empty DataTable has only the columns found in the headers that were read during <see cref="GetHeadersFromFile"/>
168
    /// </summary>
169
    /// <param name="listener"></param>
170
    /// <returns></returns>
171
    public DataTable GetDataTableWithHeaders(IDataLoadEventListener listener)
172
    {
173
        if (_state != State.AfterHeadersRead)
152!
174
            throw new Exception($"Illegal state, data table cannot be created at state {_state}");
×
175

176
        _state = State.AfterTableGenerated;
152✔
177

178
        var dt = new DataTable();
152✔
179

180
        var duplicateHeaders = new List<string>();
152✔
181
        var unamedColumns = new List<DataColumn>();
152✔
182

183
        //create a string column for each header - these will change type once we have read some data
184
        foreach (var header in _headers)
1,184✔
185
        {
186
            var h = header;
440✔
187

188
            //if we are ignoring this column
189
            if (h != null && IgnoreColumnsList.Contains(h.Trim()))
440✔
190
                continue; //skip adding to dt
191

192
            //watch for duplicate columns
193
            if (dt.Columns.Contains(header))
436!
194
                if (_makeHeaderNamesSane)
×
195
                {
196
                    h = MakeHeaderUnique(header, dt.Columns, listener, this);
×
197
                }
198
                else
199
                {
200
                    duplicateHeaders.Add(header);
×
201
                    continue;
×
202
                }
203

204
            if (h.IsBasicallyNull())
436✔
205
            {
206
                unamedColumns.Add(dt.Columns.Add(h));
4✔
207
            }
208
            else
209
            //override type
210
            if (_explicitlyTypedColumns?.ExplicitTypesCSharp.TryGetValue(h, out var t) == true)
432✔
211
            {
212
                var c = dt.Columns.Add(h, t);
4✔
213

214
                //if the user wants a string don't let downstream components pick a different Type (by assuming it is is untyped)
215
                if (c.DataType == typeof(string))
4✔
216
                    c.SetDoNotReType(true);
4✔
217
            }
218
            else
219
            {
220
                dt.Columns.Add(h);
428✔
221
            }
222
        }
223

224
        UnamedColumns = new ReadOnlyCollection<DataColumn>(unamedColumns);
152✔
225

226
        return duplicateHeaders.Any()
152!
227
            ? throw new FlatFileLoadException(
152✔
228
                $"Found the following duplicate headers in file '{_toLoad.File}':{string.Join(",", duplicateHeaders)}")
152✔
229
            : dt;
152✔
230
    }
231

232
    /// <summary>
233
    /// Takes an existing DataTable with a fixed schema and validates the columns read during <see cref="GetHeadersFromFile"/> against it making minor changes
234
    /// where appropriate to match the schema
235
    /// </summary>
236
    /// <param name="dt"></param>
237
    /// <param name="listener"></param>
238
    /// <returns></returns>
239
    public void MakeDataTableFitHeaders(DataTable dt, IDataLoadEventListener listener)
240
    {
241
        if (_state != State.AfterHeadersRead)
86!
242
            throw new Exception($"Illegal state, data table cannot be created at state {_state}");
×
243

244
        _state = State.AfterTableGenerated;
86✔
245

246
        var ASCIIArt = new StringBuilder();
86✔
247

248
        var headersNotFound = new List<string>();
86✔
249

250
        for (var index = 0; index < _headers.Length; index++)
888✔
251
        {
252
            ASCIIArt.Append($"[{index}]");
358✔
253

254
            if (dt.Columns.Contains(_headers[index])) //exact match
358✔
255
            {
256
                ASCIIArt.AppendLine($"{_headers[index]}>>>{_headers[index]}");
350✔
257
                continue;
350✔
258
            }
259

260
            if (string.IsNullOrWhiteSpace(_headers[index])) //Empty column header, ignore it
8!
261
            {
262
                ASCIIArt.AppendLine("Blank Column>>>IGNORED");
×
263
                continue;
×
264
            }
265

266
            //if we are ignoring the header
267
            if (IgnoreColumnsList.Contains(_headers[index]))
8✔
268
            {
269
                ASCIIArt.AppendLine($"{_headers[index]}>>>IGNORED");
6✔
270
                continue;
6✔
271
            }
272

273
            //try replacing spaces with underscores
274
            if (dt.Columns.Contains(_headers[index].Replace(" ", "_")))
2!
275
            {
276
                var before = _headers[index];
×
277
                _headers[index] = _headers[index].Replace(" ", "_");
×
278

279
                ASCIIArt.AppendLine($"{before}>>>{_headers[index]}");
×
280
                continue;
×
281
            }
282

283
            //try replacing spaces with nothing
284
            if (dt.Columns.Contains(_headers[index].Replace(" ", "")))
2!
285
            {
286
                var before = _headers[index];
×
287
                _headers[index] = _headers[index].Replace(" ", "");
×
288

289
                ASCIIArt.AppendLine($"{before}>>>{_headers[index]}");
×
290
                continue;
×
291
            }
292

293
            ASCIIArt.AppendLine($"{_headers[index]}>>>{UnmatchedText}");
2✔
294
            headersNotFound.Add(_headers[index]);
2✔
295
        }
296

297
        //now that we have adjusted the header names
298
        var unmatchedColumns =
86✔
299
            dt.Columns.Cast<DataColumn>()
86✔
300
                .Where(c => !_headers.Any(h =>
356✔
301
                    h != null &&
3,750!
302
                    h.ToLower().Equals(c.ColumnName
3,750✔
303
                        .ToLower()))) //get all columns in data table where there are not any with the same name
3,750✔
304
                .Select(c => c.ColumnName)
6✔
305
                .ToArray();
86✔
306

307
        if (unmatchedColumns.Any())
86✔
308
            ASCIIArt.AppendLine(
4✔
309
                $"{Environment.NewLine}Unmatched Columns In DataTable:{Environment.NewLine}{string.Join(Environment.NewLine, unmatchedColumns)}");
4✔
310

311
        //if there is exactly 1 column found by the program and there are unmatched columns it is likely the user has selected the wrong separator
312
        if (_headers.Length == 1 && unmatchedColumns.Any())
86✔
313
            foreach (var commonSeparator in _commonSeparators)
10✔
314
                if (_headers[0].Contains(commonSeparator))
4✔
315
                    listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Error,
2✔
316
                        $"Your separator does not appear in the headers line of your file ({_toLoad.File.Name}) but the separator '{commonSeparator}' does... did you mean to set the Separator to '{commonSeparator}'? The headers line is:\"{_headers[0]}\""));
2✔
317

318
        listener.OnNotify(this, new NotifyEventArgs(
84!
319
            headersNotFound.Any()
84✔
320
                ? ProgressEventType.Error
84✔
321
                : ProgressEventType.Information, //information or warning if there are unrecognised field names
84✔
322
            $"I will now tell you about how the columns in your file do or do not match the columns in your database, Matching flat file columns (or forced replacement headers) against database headers resulted in:{Environment.NewLine}{ASCIIArt}")); //tell them about what columns match what
84✔
323

324

325
        if (headersNotFound.Any())
84!
326
            throw new Exception(
×
327
                $"Could not find a suitable target column for flat file columns {string.Join(",", headersNotFound)} amongst database data table columns ({string.Join(",", from DataColumn col in dt.Columns select col.ColumnName)})");
×
328
    }
84✔
329

330
    public static string MakeHeaderUnique(string newColumnName, DataColumnCollection columnsSoFar,
331
        IDataLoadEventListener listener, object sender)
332
    {
333
        //if it is already unique then that's fine
334
        if (!columnsSoFar.Contains(newColumnName))
×
335
            return newColumnName;
×
336

337
        //otherwise issue a rename
338
        var number = 2;
×
339
        while (columnsSoFar.Contains($"{newColumnName}_{number}"))
×
340
            number++;
×
341

342
        var newName = $"{newColumnName}_{number}";
×
343

344
        //found a novel number
345
        listener.OnNotify(sender, new NotifyEventArgs(ProgressEventType.Warning,
×
346
            $"Renamed duplicate column '{newColumnName}' to '{newName}'"));
×
347
        return newName;
×
348
    }
349

350
    /// <summary>
351
    /// Use only when ForceHeaders is on and ForceHeadersReplacesFirstLineInFile is true.  Pass the header line that was read from the file
352
    /// that will be ignored (<paramref name="row"/>).  This method will show the user what replacements were made.
353
    /// </summary>
354
    /// <param name="row"></param>
355
    /// <param name="listener"></param>
356
    public void ShowForceHeadersAsciiArt(IReaderRow row, IDataLoadEventListener listener)
357
    {
358
        if (_state < State.AfterHeadersRead)
14!
359
            throw new Exception($"Illegal state:{_state}");
×
360

361
        if (string.IsNullOrWhiteSpace(_forceHeaders))
14!
362
            throw new Exception("There are no force headers! how did we get here");
×
363

364
        if (!_forceHeadersReplacesFirstLineInFile)
14!
365
            throw new Exception("Headers do not replace the first line in the file, how did we get here!");
×
366

367
        //create an ascii art representation of the headers being replaced in the format
368
        //[0]MySensibleCol>>>My Silly Coll#
369
        var asciiArt = new StringBuilder();
14✔
370
        for (var i = 0; i < _headers.Length; i++)
96✔
371
        {
372
            asciiArt.Append($"[{i}]{_headers[i]}>>>");
34✔
373
            asciiArt.AppendLine(i < row.ColumnCount ? row[i] : "???");
34!
374
        }
375

376
        for (var i = _headers.Length; i < row.ColumnCount; i++)
28!
377
            asciiArt.AppendLine($"[{i}]???>>>{row[i]}");
×
378

379
        listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
14✔
380
            $"Your attacher has ForceHeaders and ForceHeadersReplacesFirstLineInFile=true, I will now tell you about the first line of data in the file that you skipped (and how it related to your forced headers).  Replacement headers are {Environment.NewLine}{Environment.NewLine}{asciiArt}"));
14✔
381

382
        if (row.ColumnCount != _headers.Length)
14✔
383
            listener.OnNotify(this,
14✔
384
                new NotifyEventArgs(ProgressEventType.Warning,
14✔
385
                    "The number of ForceHeader replacement headers specified does not match the number of headers in the file (being replaced)"));
14✔
386

387
        var discarded = new StringBuilder();
14✔
388
        for (var i = 0; i < row.ColumnCount; i++)
28!
389
        {
390
            if (i > 0)
×
391
                discarded.Append(',');
×
392
            discarded.Append(row[i]);
×
393
        }
394

395
        listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
14✔
396
            $"Skipped first line of file because there are forced replacement headers, we discarded: {discarded}"));
14✔
397
    }
14✔
398
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc