
HicServices / RDMP / 6245535001

20 Sep 2023 07:44AM UTC · coverage: 57.013% · first build

Trigger: push · github · web-flow
8.1.0 Release (#1628)

* Bump Newtonsoft.Json from 13.0.1 to 13.0.2

Bumps [Newtonsoft.Json](https://github.com/JamesNK/Newtonsoft.Json) from 13.0.1 to 13.0.2.
- [Release notes](https://github.com/JamesNK/Newtonsoft.Json/releases)
- [Commits](https://github.com/JamesNK/Newtonsoft.Json/compare/13.0.1...13.0.2)

---
updated-dependencies:
- dependency-name: Newtonsoft.Json
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump NLog from 5.0.5 to 5.1.0

Bumps [NLog](https://github.com/NLog/NLog) from 5.0.5 to 5.1.0.
- [Release notes](https://github.com/NLog/NLog/releases)
- [Changelog](https://github.com/NLog/NLog/blob/dev/CHANGELOG.md)
- [Commits](https://github.com/NLog/NLog/compare/v5.0.5...v5.1.0)

---
updated-dependencies:
- dependency-name: NLog
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump NLog from 5.0.5 to 5.1.0

* Fix -r flag - should have been --results-directory all along

* Bump Newtonsoft.Json from 13.0.1 to 13.0.2

* Bump YamlDotNet from 12.0.2 to 12.1.0

Bumps [YamlDotNet](https://github.com/aaubry/YamlDotNet) from 12.0.2 to 12.1.0.
- [Release notes](https://github.com/aaubry/YamlDotNet/releases)
- [Commits](https://github.com/aaubry/YamlDotNet/compare/v12.0.2...v12.1.0)

---
updated-dependencies:
- dependency-name: YamlDotNet
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump Moq from 4.18.2 to 4.18.3

Bumps [Moq](https://github.com/moq/moq4) from 4.18.2 to 4.18.3.
- [Release notes](https://github.com/moq/moq4/releases)
- [Changelog](https://github.com/moq/moq4/blob/main/CHANGELOG.md)
- [Commits](https://github.com/moq/moq4/compare/v4.18.2...v4.18.3)

---
updated-dependencies:
- dependency-name: Moq
... (continued)

10732 of 20257 branches covered (52.98%)

Branch coverage is included in the aggregate percentage.

48141 of 48141 new or added lines in 1086 files covered (100.0%)

30685 of 52388 relevant lines covered (58.57%)

7387.88 hits per line
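
As a back-of-the-envelope check (not Coveralls' documented formula), the 57.013% aggregate reported in the header is consistent with pooling covered lines and covered branches over relevant lines and total branches:

// Pooled line + branch coverage, matching the 57.013% reported above.
// (30685 + 10732) / (52388 + 20257) = 41417 / 72645 ≈ 0.57013
using System;

var aggregate = 100.0 * (30685 + 10732) / (52388 + 20257);
Console.WriteLine($"{aggregate:F3}%"); // prints "57.013%"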

Source File: /Rdmp.Core/DataLoad/Modules/DataFlowSources/SubComponents/FlatFileToDataTablePusher.cs (86.92% covered)

// Copyright (c) The University of Dundee 2018-2019
// This file is part of the Research Data Management Platform (RDMP).
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.

using System;
using System.Collections.Generic;
using System.Data;
using System.Globalization;
using System.IO;
using System.Linq;
using CsvHelper;
using Rdmp.Core.DataFlowPipeline.Requirements;
using Rdmp.Core.ReusableLibraryCode;
using Rdmp.Core.ReusableLibraryCode.Progress;
using TypeGuesser;
using TypeGuesser.Deciders;

namespace Rdmp.Core.DataLoad.Modules.DataFlowSources.SubComponents;

/// <summary>
/// This class is a sub component of <see cref="DelimitedFlatFileDataFlowSource"/>, it is responsible for adding rows read from the CSV file to
/// the DataTable built by <see cref="FlatFileColumnCollection"/>.
/// </summary>
public class FlatFileToDataTablePusher
{
    private readonly FlatFileToLoad _fileToLoad;
    private readonly FlatFileColumnCollection _headers;
    private readonly Func<string, object> _hackValuesFunc;
    private readonly bool _attemptToResolveNewlinesInRecords;
    private readonly CultureInfo _culture;
    private readonly string _explicitDateTimeFormat;
    private TypeDeciderFactory typeDeciderFactory;

    /// <summary>
    /// Used in the event of reading too few cells for the current line.  The pusher will peek at the next lines to see if they
    /// make up a coherent row e.g. if a free text field is splitting up the document with newlines.  If the peeked lines do not
    /// resolve the problem then the line will be marked as BadData and the peeked records must be reprocessed by <see cref="DelimitedFlatFileDataFlowSource"/>
    /// </summary>
    public FlatFileLine PeekedRecord;

    /// <summary>
    /// All line numbers of the source file being read that could not be processed.  Allows BadDataFound etc to be called multiple times without skipping
    /// records by accident.
    /// </summary>
    public HashSet<int> BadLines = new();

    /// <summary>
    /// This is incremented when too many values are read from the file to match the header count BUT the values read were null/empty
    /// </summary>
    private long _bufferOverrunsWhereColumnValueWasBlank;

    /// <summary>
    /// We only complain once about headers not matching the number of cell values
    /// </summary>
    private bool _haveComplainedAboutColumnMismatch;

    public FlatFileToDataTablePusher(FlatFileToLoad fileToLoad, FlatFileColumnCollection headers,
        Func<string, object> hackValuesFunc, bool attemptToResolveNewlinesInRecords, CultureInfo culture,
        string explicitDateTimeFormat)
    {
        _fileToLoad = fileToLoad;
        _headers = headers;
        _hackValuesFunc = hackValuesFunc;
        _attemptToResolveNewlinesInRecords = attemptToResolveNewlinesInRecords;
        _culture = culture ?? CultureInfo.CurrentCulture;
        _explicitDateTimeFormat = explicitDateTimeFormat;
        typeDeciderFactory = new TypeDeciderFactory(_culture);

        if (!string.IsNullOrWhiteSpace(explicitDateTimeFormat))
            typeDeciderFactory.Settings.ExplicitDateFormats = new[] { explicitDateTimeFormat };
    }

    public int PushCurrentLine(CsvReader reader, FlatFileLine lineToPush, DataTable dt, IDataLoadEventListener listener,
        FlatFileEventHandlers eventHandlers)
    {
        //skip the blank lines
        if (lineToPush.Cells.Length == 0 || lineToPush.Cells.All(h => h.IsBasicallyNull()))
            return 0;

        var headerCount = _headers.CountNotNull;

        //if the number of not empty headers doesn't match the headers in the data table
        if (dt.Columns.Count != headerCount)
            if (!_haveComplainedAboutColumnMismatch)
            {
                listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning,
                    $"Flat file '{_fileToLoad.File.Name}' line number '{reader.Context.Parser.RawRow}' had  {headerCount} columns while the destination DataTable had {dt.Columns.Count} columns.  This message appears only once per file"));
                _haveComplainedAboutColumnMismatch = true;
            }

        var rowValues = new Dictionary<string, object>();

        if (lineToPush.Cells.Length < headerCount)
            if (!DealWithTooFewCellsOnCurrentLine(reader, lineToPush, listener, eventHandlers))
                return 0;

        var haveIncremented_bufferOverrunsWhereColumnValueWasBlank = false;


        for (var i = 0; i < lineToPush.Cells.Length; i++)
        {
            //about to do a buffer overrun
            if (i >= _headers.Length)
                if (lineToPush[i].IsBasicallyNull())
                {
                    if (!haveIncremented_bufferOverrunsWhereColumnValueWasBlank)
                    {
                        _bufferOverrunsWhereColumnValueWasBlank++;
                        haveIncremented_bufferOverrunsWhereColumnValueWasBlank = true;
                    }

                    continue; //do not bother buffer overrunning with null whitespace stuff
                }
                else
                {
                    var errorMessage =
                        $"Column mismatch on line {reader.Context.Parser.RawRow} of file '{dt.TableName}', it has too many columns (expected {_headers.Length} columns but line had  {lineToPush.Cells.Length})";

                    if (_bufferOverrunsWhereColumnValueWasBlank > 0)
                        errorMessage +=
                            $" ( {_bufferOverrunsWhereColumnValueWasBlank} Previously lines also suffered from buffer overruns but the overrunning values were empty so we had ignored them up until now)";

                    listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning, errorMessage));
                    eventHandlers.BadDataFound(lineToPush);
                    break;
                }

            //if we are ignoring this header
            if (_headers.IgnoreColumnsList.Contains(_headers[i]))
                continue;

            //it's an empty header, don't bother populating it
            if (_headers[i].IsBasicallyNull())
                if (!lineToPush[i].IsBasicallyNull())
                    throw new FileLoadException(
                        $"The header at index {i} in flat file '{dt.TableName}' had no name but there was a value in the data column (on Line number {reader.Context.Parser.RawRow})");
                else
                    continue;

            //sometimes flat files have ,NULL,NULL,"bob" in instead of ,,"bob"
            if (lineToPush[i].IsBasicallyNull())
            {
                rowValues.Add(_headers[i], DBNull.Value);
            }
            else
            {
                var hackedValue = _hackValuesFunc(lineToPush[i]);

                if (hackedValue is string value)
                    hackedValue = value.Trim();

                try
                {
                    if (hackedValue is string s &&
                        typeDeciderFactory.Dictionary.TryGetValue(dt.Columns[_headers[i]].DataType, out var decider))
                        hackedValue = decider.Parse(s);

                    rowValues.Add(_headers[i], hackedValue);
                }
                catch (Exception e)
                {
                    throw new FileLoadException(
                        $"Error reading file '{dt.TableName}'.  Problem loading value {lineToPush[i]} into data table (on Line number {reader.Context.Parser.RawRow}) the header we were trying to populate was {_headers[i]} and was of datatype {dt.Columns[_headers[i]].DataType}",
                        e);
                }
            }
        }

        if (!BadLines.Contains(reader.Context.Parser.RawRow))
        {
            var currentRow = dt.Rows.Add();
            foreach (var kvp in rowValues)
                currentRow[kvp.Key] = kvp.Value;

            return 1;
        }

        return 0;
    }

    private bool DealWithTooFewCellsOnCurrentLine(CsvReader reader, FlatFileLine lineToPush,
        IDataLoadEventListener listener, FlatFileEventHandlers eventHandlers)
    {
        if (!_attemptToResolveNewlinesInRecords)
        {
            //we read too little cell count but we don't want to solve the problem
            listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning,
                $"Too few columns on line {reader.Context.Parser.RawRow} of file '{_fileToLoad}', it has too many columns (expected {_headers.Length} columns but line had {lineToPush.Cells.Length}).{(_bufferOverrunsWhereColumnValueWasBlank > 0 ? $"( {_bufferOverrunsWhereColumnValueWasBlank} Previously lines also suffered from buffer overruns but the overrunning values were empty so we had ignored them up until now)" : "")}"));
            eventHandlers.BadDataFound(lineToPush);

            //didn't bother trying to fix the problem
            return false;
        }

        //We want to try to fix the problem by reading more data

        //Create a composite row
        var newCells = new List<string>(lineToPush.Cells);

        //track what we are Reading incase it doesn't work
        var allPeekedLines = new List<FlatFileLine>();

        do
        {
            FlatFileLine peekedLine;

            //try adding the next row
            if (reader.Read())
            {
                peekedLine = new FlatFileLine(reader.Context);

                //peeked line was 'valid' on its own
                if (peekedLine.Cells.Length >= _headers.Length)
                {
                    //queue it for reprocessing
                    PeekedRecord = peekedLine;

                    //and mark everything else as bad
                    AllBad(lineToPush, allPeekedLines, eventHandlers);
                    return false;
                }

                //peeked line was invalid (too short) so we can add it onto ourselves
                allPeekedLines.Add(peekedLine);
            }
            else
            {
                //Ran out of space in the file without fixing the problem so it's all bad
                AllBad(lineToPush, allPeekedLines, eventHandlers);

                //couldn't fix the problem
                return false;
            }

            //add the peeked line to the current cells
            //add the first record as an extension of the last cell in current row
            if (peekedLine.Cells.Length != 0)
                newCells[^1] += Environment.NewLine + peekedLine.Cells[0];
            else
                newCells[^1] += Environment.NewLine; //the next line was completely blank! just add a new line

            //add any further cells on after that
            newCells.AddRange(peekedLine.Cells.Skip(1));
        } while (newCells.Count < _headers.Length);


        //if we read too much or reached the end of the file
        if (newCells.Count > _headers.Length)
        {
            AllBadExceptLastSoRequeueThatOne(lineToPush, allPeekedLines, eventHandlers);
            return false;
        }

        if (newCells.Count != _headers.Length)
            throw new Exception("We didn't over read or reach end of file, how did we get here?");

        //we managed to create a full row
        lineToPush.Cells = newCells.ToArray();

        //problem was fixed
        return true;
    }


    public DataTable StronglyTypeTable(DataTable workingTable, ExplicitTypingCollection explicitTypingCollection)
    {
        var deciders = new Dictionary<int, IDecideTypesForStrings>();
        var factory = new TypeDeciderFactory(_culture);

        if (!string.IsNullOrWhiteSpace(_explicitDateTimeFormat))
            factory.Settings.ExplicitDateFormats = new[] { _explicitDateTimeFormat };

        var dtCloned = workingTable.Clone();
        dtCloned.BeginLoadData();
        var typeChangeNeeded = false;

        foreach (DataColumn col in workingTable.Columns)
        {
            //if we have already handled it
            if (explicitTypingCollection != null &&
                explicitTypingCollection.ExplicitTypesCSharp.ContainsKey(col.ColumnName))
                continue;

            //let's make a decision about the data type to use based on the contents
            var computedType = new Guesser();
            computedType.AdjustToCompensateForValues(col);

            //Type based on the contents of the column
            if (computedType.ShouldDowngradeColumnTypeToMatchCurrentEstimate(col))
            {
                dtCloned.Columns[col.ColumnName].DataType = computedType.Guess.CSharpType;

                //if we have a type decider to parse this data type
                if (factory.IsSupported(computedType.Guess.CSharpType))
                    deciders.Add(col.Ordinal,
                        factory.Create(computedType.Guess.CSharpType)); //record column index and parser

                typeChangeNeeded = true;
            }
        }

        if (typeChangeNeeded)
        {
            foreach (DataRow row in workingTable.Rows)
                dtCloned.Rows.Add(row.ItemArray.Select((v, idx) =>
                    deciders.TryGetValue(idx, out var decider) && v is string s ? decider.Parse(s) : v).ToArray());

            return dtCloned;
        }

        return workingTable;
    }

    private void AllBadExceptLastSoRequeueThatOne(FlatFileLine lineToPush, List<FlatFileLine> allPeekedLines,
        FlatFileEventHandlers eventHandlers)
    {
        //the current line is bad
        eventHandlers.BadDataFound(lineToPush);

        //last line resulted in the overrun so requeue it
        PeekedRecord = allPeekedLines.Last();

        //but throw away everything else we read
        foreach (var line in allPeekedLines.Take(allPeekedLines.Count - 1))
            eventHandlers.BadDataFound(line);
    }

    private static void AllBad(FlatFileLine lineToPush, List<FlatFileLine> allPeekedLines,
        FlatFileEventHandlers eventHandlers)
    {
        //the current line is bad
        eventHandlers.BadDataFound(lineToPush);

        foreach (var line in allPeekedLines)
            eventHandlers.BadDataFound(line);
    }
}
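
The peek-ahead repair documented on PeekedRecord and implemented in DealWithTooFewCellsOnCurrentLine can be easier to follow in isolation. Below is a minimal, self-contained sketch of the same cell-merging idea outside RDMP; the type and method names are hypothetical (not part of the RDMP API) and it deliberately omits the bad-line bookkeeping and the requeueing of a complete peeked record that the real class performs:

using System;
using System.Collections.Generic;
using System.Linq;

static class NewlineRepairSketch
{
    // rows: already-parsed CSV records; headerCount: the expected number of cells per record.
    // A short record is assumed to be the continuation of a free-text cell that contained newlines.
    public static IEnumerable<string[]> MergeShortRows(IEnumerable<string[]> rows, int headerCount)
    {
        List<string> pending = null;

        foreach (var cells in rows)
        {
            if (pending == null)
            {
                if (cells.Length == 0)
                    continue; // blank record: skipped, as PushCurrentLine does before the repair step

                if (cells.Length >= headerCount)
                {
                    yield return cells; // complete row, pass it straight through
                    continue;
                }

                pending = new List<string>(cells); // too short: start building a composite row
                continue;
            }

            // Glue the first cell of the next record onto the last cell of the composite row,
            // then append any remaining cells (mirrors newCells[^1] += ... and AddRange above).
            pending[^1] += Environment.NewLine + (cells.Length > 0 ? cells[0] : "");
            pending.AddRange(cells.Skip(1));

            if (pending.Count >= headerCount)
            {
                // The real class additionally treats an over-long composite as bad data.
                yield return pending.ToArray();
                pending = null;
            }
        }
    }
}

For example, with headerCount = 3, the records ["1", "free text that"] and ["continues", "2020-01-01"] merge into the single row ["1", "free text that" + newline + "continues", "2020-01-01"].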