• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

HicServices / RDMP / 6245535001

20 Sep 2023 07:44AM UTC coverage: 57.013%. First build
6245535001

push

github

web-flow
8.1.0 Release (#1628)

* Bump Newtonsoft.Json from 13.0.1 to 13.0.2

Bumps [Newtonsoft.Json](https://github.com/JamesNK/Newtonsoft.Json) from 13.0.1 to 13.0.2.
- [Release notes](https://github.com/JamesNK/Newtonsoft.Json/releases)
- [Commits](https://github.com/JamesNK/Newtonsoft.Json/compare/13.0.1...13.0.2)

---
updated-dependencies:
- dependency-name: Newtonsoft.Json
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump NLog from 5.0.5 to 5.1.0

Bumps [NLog](https://github.com/NLog/NLog) from 5.0.5 to 5.1.0.
- [Release notes](https://github.com/NLog/NLog/releases)
- [Changelog](https://github.com/NLog/NLog/blob/dev/CHANGELOG.md)
- [Commits](https://github.com/NLog/NLog/compare/v5.0.5...v5.1.0)

---
updated-dependencies:
- dependency-name: NLog
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump NLog from 5.0.5 to 5.1.0

* Fix -r flag - should have been --results-directory all along

* Bump Newtonsoft.Json from 13.0.1 to 13.0.2

* Bump YamlDotNet from 12.0.2 to 12.1.0

Bumps [YamlDotNet](https://github.com/aaubry/YamlDotNet) from 12.0.2 to 12.1.0.
- [Release notes](https://github.com/aaubry/YamlDotNet/releases)
- [Commits](https://github.com/aaubry/YamlDotNet/compare/v12.0.2...v12.1.0)

---
updated-dependencies:
- dependency-name: YamlDotNet
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump Moq from 4.18.2 to 4.18.3

Bumps [Moq](https://github.com/moq/moq4) from 4.18.2 to 4.18.3.
- [Release notes](https://github.com/moq/moq4/releases)
- [Changelog](https://github.com/moq/moq4/blob/main/CHANGELOG.md)
- [Commits](https://github.com/moq/moq4/compare/v4.18.2...v4.18.3)

---
updated-dependencies:
- dependency-name: Moq
... (continued)

10732 of 20257 branches covered (0.0%)

Branch coverage included in aggregate %.

48141 of 48141 new or added lines in 1086 files covered. (100.0%)

30685 of 52388 relevant lines covered (58.57%)

7387.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

47.18
/Rdmp.Core/QueryBuilding/PrimaryKeyCollisionResolver.cs
1
// Copyright (c) The University of Dundee 2018-2019
2
// This file is part of the Research Data Management Platform (RDMP).
3
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
4
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
5
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.
6

7
using System;
8
using System.Collections.Generic;
9
using System.Linq;
10
using System.Text.RegularExpressions;
11
using FAnsi.Discovery.QuerySyntax;
12
using Rdmp.Core.Curation.Data;
13
using Rdmp.Core.Curation.Data.DataLoad;
14
using Rdmp.Core.DataLoad.Triggers;
15

16
namespace Rdmp.Core.QueryBuilding;
17

18
/// <summary>
19
/// The RDMP data load engine is designed to prevent duplicate data entering your live database.  This is achieved by requiring a primary key defined by the source
20
/// data (i.e. not an autonum).  However it is expected that semantically correct primary keys will not be perfectly supplied in all cases by data providers, for example
21
/// if 'TestLabCode' is the primary key on biochemistry but duplicates appear with unique values in 'DataAge' it would be reasonable to assume that newer 'DataAge' records
22
/// replace older ones.  Therefore we might decide to keep the primary key as 'TestLabCode' and then discard duplicate records based on preserving the latest 'DataAge'.
23
/// 
24
/// <para>This class handles creating the query that deletes duplicates based on the column preference order supplied (See ConfigurePrimaryKeyCollisionResolution). </para>
25
/// </summary>
26
public class PrimaryKeyCollisionResolver
27
{
28
    private readonly ITableInfo _tableInfo;
29
    private readonly IQuerySyntaxHelper _querySyntaxHelper;
30
    private const string WithCTE = "WITH CTE (DuplicateCount)";
31
    private const string SelectRownum = "\t SELECT ROW_NUMBER()";
32

33
    private const string DeleteBit =
34
        @"DELETE 
35
FROM CTE 
36
WHERE DuplicateCount > 1";
37

38
    /// <summary>
39
    /// Creates a new collision resolver using the primary keys and resolution order of the supplied <see cref="TableInfo"/>
40
    /// </summary>
41
    /// <param name="tableInfo"></param>
42
    public PrimaryKeyCollisionResolver(ITableInfo tableInfo)
10✔
43
    {
44
        _tableInfo = tableInfo;
10✔
45
        _querySyntaxHelper = _tableInfo.GetQuerySyntaxHelper();
10✔
46
    }
10✔
47

48
    /// <summary>
49
    /// Get the SQL to run to delete records colliding on primary key
50
    /// </summary>
51
    /// <returns></returns>
52
    public string GenerateSQL() => GenerateSQL(out _, out _);
10✔
53

54
    private string GenerateSQL(out ColumnInfo[] pks, out List<IResolveDuplication> resolvers)
55
    {
56
        var tableNameInRAW = GetTableName();
10✔
57

58
        var cols = _tableInfo.ColumnInfos.ToArray();
10✔
59
        pks = cols.Where(col => col.IsPrimaryKey).ToArray();
40✔
60

61
        if (!pks.Any())
10✔
62
            throw new Exception(
4✔
63
                $"TableInfo {_tableInfo.GetRuntimeName()} does not have any primary keys defined so cannot resolve primary key collisions");
4✔
64

65
        var primaryKeys = pks.Aggregate("", (s, n) =>
6✔
66
            $"{s}{_querySyntaxHelper.EnsureWrapped(n.GetRuntimeName(LoadStage.AdjustRaw))},");
12✔
67
        primaryKeys = primaryKeys.TrimEnd(',');
6✔
68

69

70
        var sql =
6✔
71
            $"/*Notice how entities are not fully indexed with Database, this is because this code will run on RAW servers, prior to reaching STAGING/LIVE - the place where there are primary keys*/{Environment.NewLine}{WithCTE}{Environment.NewLine}AS{Environment.NewLine}({Environment.NewLine}{SelectRownum} OVER({Environment.NewLine}\t PARTITION BY{Environment.NewLine}\t\t {primaryKeys}{Environment.NewLine}\t ORDER BY{Environment.NewLine}\t /*Priority in which order they should be used to resolve duplication of the primary key values, order by:*/{Environment.NewLine}";
6✔
72

73
        resolvers = new List<IResolveDuplication>();
6✔
74

75
        resolvers.AddRange(cols.Where(c => c.DuplicateRecordResolutionOrder != null));
24✔
76
        resolvers.AddRange(_tableInfo.PreLoadDiscardedColumns.Where(c => c.DuplicateRecordResolutionOrder != null));
6✔
77

78
        if (!resolvers.Any())
6✔
79
            throw new Exception(
2✔
80
                $"The ColumnInfos of TableInfo {_tableInfo} do not have primary key resolution orders configured (do not know which order to use non primary key column values in to resolve collisions).  Fix this by right clicking a TableInfo in CatalogueManager and selecting 'Configure Primary Key Collision Resolution'.");
2✔
81

82
        //order by the priority of columns
83
        foreach (var column in resolvers.OrderBy(col => col.DuplicateRecordResolutionOrder))
32✔
84
        {
85
            if (column is ColumnInfo { IsPrimaryKey: true })
8!
86
                throw new Exception(
×
87
                    $"Column {column.GetRuntimeName()} is flagged as primary key when it also has a DuplicateRecordResolutionOrder, primary keys cannot be used to resolve duplication since they are the hash!  Resolve this in the CatalogueManager by right clicking the offending TableInfo {_tableInfo.GetRuntimeName()} and editing the resolution order");
×
88

89
            sql = AppendRelevantOrderBySql(sql, column);
8✔
90
        }
91

92
        //trim the last remaining open bracket
93
        sql =
4✔
94
            $"{sql.TrimEnd(',', '\r', '\n')}{Environment.NewLine}) AS DuplicateCount{Environment.NewLine}FROM {tableNameInRAW}{Environment.NewLine}){Environment.NewLine}{DeleteBit}";
4✔
95

96
        return sql;
4✔
97
    }
98

99
    private string GetTableName() => _tableInfo.GetRuntimeName(LoadStage.AdjustRaw);
10✔
100

101
    private string AppendRelevantOrderBySql(string sql, IResolveDuplication col)
102
    {
103
        var colname = _querySyntaxHelper.EnsureWrapped(col.GetRuntimeName(LoadStage.AdjustRaw));
8✔
104

105
        var direction = col.DuplicateRecordResolutionIsAscending ? " ASC" : " DESC";
8✔
106

107
        //don't bother adding these because they are hic generated
108
        if (SpecialFieldNames.IsHicPrefixed(colname))
8!
109
            return sql;
×
110

111
        var valueType = GetDataType(col.Data_type);
8✔
112

113
        if (valueType == ValueType.CharacterString)
8!
114
            //character strings are compared first by LENGTH (to prefer longer data)
115
            //then by alphabetical comparison to prefer things towards the start of the alphabet (because this makes sense?!)
116
            return
×
117
                $"{sql}LEN(ISNULL({colname},{GetNullSubstituteForComparisonsWithDataType(col.Data_type, true)})){direction},{Environment.NewLine}ISNULL({colname},{GetNullSubstituteForComparisonsWithDataType(col.Data_type, true)}){direction},{Environment.NewLine}";
×
118

119
        return
8✔
120
            $"{sql}ISNULL({colname},{GetNullSubstituteForComparisonsWithDataType(col.Data_type, true)}){direction},{Environment.NewLine}";
8✔
121
    }
122

123
    /// <summary>
124
    /// Generates the SQL that will be run to determine whether there are any record collisions on primary key (in RAW)
125
    /// </summary>
126
    /// <returns></returns>
127
    public string GenerateCollisionDetectionSQL()
128
    {
129
        var tableNameInRAW = GetTableName();
×
130
        var pks = _tableInfo.ColumnInfos.Where(col => col.IsPrimaryKey).ToArray();
×
131

132
        var sql = "";
×
133
        sql += $"select case when exists({Environment.NewLine}";
×
134
        sql += $"select 1 FROM{Environment.NewLine}";
×
135
        sql += tableNameInRAW + Environment.NewLine;
×
136
        sql +=
×
137
            $"group by {pks.Aggregate("", (s, n) => $"{s}{_querySyntaxHelper.EnsureWrapped(n.GetRuntimeName(LoadStage.AdjustRaw))},")}{Environment.NewLine}";
×
138
        sql = sql.TrimEnd(new[] { ',', '\r', '\n' }) + Environment.NewLine;
×
139
        sql += $"having count(*) > 1{Environment.NewLine}";
×
140
        sql += $") then 1 else 0 end{Environment.NewLine}";
×
141

142
        return sql;
×
143
    }
144

145
    /// <summary>
146
    /// Generates SQL to show which records would be deleted by primary key collision resolution.  This should be run manually by the data analyst if he is unsure about the
147
    /// resolution order / current primary keys
148
    /// </summary>
149
    /// <returns></returns>
150
    public string GeneratePreviewSQL()
151
    {
152
        var basicSQL = GenerateSQL(out var pks, out var resolvers);
×
153

154
        var commaSeparatedPKs = string.Join(",",
×
155
            pks.Select(c => _querySyntaxHelper.EnsureWrapped(c.GetRuntimeName(LoadStage.AdjustRaw))));
×
156
        var commaSeparatedCols = string.Join(",",
×
157
            resolvers.Select(c => _querySyntaxHelper.EnsureWrapped(c.GetRuntimeName(LoadStage.AdjustRaw))));
×
158

159
        //add all the columns to the WITH CTE bit
160
        basicSQL = basicSQL.Replace(WithCTE, $"WITH CTE ({commaSeparatedPKs},{commaSeparatedCols},DuplicateCount)");
×
161
        basicSQL = basicSQL.Replace(SelectRownum, $"\t SELECT {commaSeparatedPKs},{commaSeparatedCols},ROW_NUMBER()");
×
162
        basicSQL = basicSQL.Replace(DeleteBit, "");
×
163

164
        basicSQL += $"select{Environment.NewLine}";
×
165
        basicSQL +=
×
166
            $"\tCase when DuplicateCount = 1 then 'Retained' else 'Deleted' end as PlannedOperation,*{Environment.NewLine}";
×
167
        basicSQL += $"FROM CTE{Environment.NewLine}";
×
168
        basicSQL += $"where{Environment.NewLine}";
×
169
        basicSQL += $"exists{Environment.NewLine}";
×
170
        basicSQL += $"({Environment.NewLine}";
×
171
        basicSQL += $"\tselect 1{Environment.NewLine}";
×
172
        basicSQL += $"\tfrom{Environment.NewLine}";
×
173
        basicSQL += $"\t\t{GetTableName()} child{Environment.NewLine}";
×
174
        basicSQL += $"\twhere{Environment.NewLine}";
×
175

176
        //add the child.pk1 = CTE.pk1 bit to restrict preview only to rows that are going to get compared for nukage
177
        basicSQL += string.Join("\r\n\t\tand", pks.Select(pk =>
×
178
            $"\t\tchild.{_querySyntaxHelper.EnsureWrapped(pk.GetRuntimeName(LoadStage.AdjustRaw))}= CTE.{_querySyntaxHelper.EnsureWrapped(pk.GetRuntimeName(LoadStage.AdjustRaw))}"));
×
179

180
        basicSQL += $"\tgroup by{Environment.NewLine}";
×
181
        basicSQL += string.Join(",\r\n", pks.Select(pk =>
×
182
            $"\t\t{_querySyntaxHelper.EnsureWrapped(pk.GetRuntimeName(LoadStage.AdjustRaw))}"));
×
183

184
        basicSQL += $"\t\t{Environment.NewLine}";
×
185
        basicSQL += $"\thaving count(*)>1{Environment.NewLine}";
×
186
        basicSQL += $"){Environment.NewLine}";
×
187

188
        basicSQL +=
×
189
            $"order by {string.Join(",\r\n", pks.Select(pk => _querySyntaxHelper.EnsureWrapped(pk.GetRuntimeName(LoadStage.AdjustRaw))))}";
×
190
        basicSQL += ",DuplicateCount";
×
191

192
        return basicSQL;
×
193
    }
194

195
    private static ValueType GetDataType(string dataType)
196
    {
197
        if (
8!
198
            dataType.StartsWith("decimal") ||
8✔
199
            dataType.StartsWith("float") ||
8✔
200
            dataType.Equals("bigint") ||
8✔
201
            dataType.Equals("bit") ||
8✔
202
            dataType.Contains("decimal") ||
8✔
203
            dataType.Equals("int") ||
8✔
204
            dataType.Equals("money") ||
8✔
205
            dataType.Contains("numeric") ||
8✔
206
            dataType.Equals("smallint") ||
8✔
207
            dataType.Equals("smallmoney") ||
8✔
208
            dataType.Equals("smallint") ||
8✔
209
            dataType.Equals("tinyint") ||
8✔
210
            dataType.Equals("real"))
8✔
211
            return ValueType.Numeric;
8✔
212

213
        if (dataType.Contains("date"))
×
214
            return ValueType.DateTime;
×
215

216
        if (dataType.Contains("time"))
×
217
            return ValueType.Time;
×
218

219
        if (dataType.Contains("char") || dataType.Contains("text"))
×
220
            return ValueType.CharacterString;
×
221

222
        if (dataType.Contains("binary") || dataType.Contains("image"))
×
223
            return ValueType.Binary;
×
224

225
        return dataType.Equals("cursor") ||
×
226
               dataType.Contains("timestamp") ||
×
227
               dataType.Contains("hierarchyid") ||
×
228
               dataType.Contains("uniqueidentifier") ||
×
229
               dataType.Contains("sql_variant") ||
×
230
               dataType.Contains("xml") ||
×
231
               dataType.Contains("table") ||
×
232
               dataType.Contains("spacial")
×
233
            ? ValueType.Freaky
×
234
            : throw new Exception($"Could not figure out the ValueType of SQL Type \"{dataType}\"");
×
235
    }
236

237
    /// <summary>
238
    /// When using ORDER BY to resolve primary key collisions this will specify what substitution to use for null values (such that the ORDER BY works correctly).
239
    /// </summary>
240
    /// <param name="datatype">The Sql Server column datatype for the column you are substituting</param>
241
    /// <param name="min">true to substitute null values for the minimum value of the <paramref name="datatype"/>, false to substitute for the maximum</param>
242
    /// <returns></returns>
243
    public static string GetNullSubstituteForComparisonsWithDataType(string datatype, bool min)
244
    {
245
        //technically these can go lower (real and float) but how realistic is that espcially when SqlServer plays fast and loose with very small numbers in floats...
246
        if (datatype.Equals("bigint") || datatype.Equals("real") || datatype.StartsWith("float"))
24✔
247
            return min ? "-9223372036854775808" : "9223372036854775807";
4!
248

249
        if (datatype.Equals("int"))
20✔
250
            return min ? "-2147483648" : "2147483647";
4!
251

252
        if (datatype.Equals("smallint"))
16!
253
            return min ? "-32768" : "32767";
×
254

255
        if (datatype.Equals("tinyint"))
16!
256
            return min ? "- 1.79E+308" : "255";
×
257

258
        if (datatype.Equals("bit"))
16!
259
            return min ? "0" : "1";
×
260

261
        if (datatype.Contains("decimal") || datatype.Contains("numeric"))
16!
262
        {
263
            var digits = Regex.Match(datatype, @"(\d+),?(\d+)?");
16✔
264
            var toReturn = "";
16✔
265

266
            if (min)
16✔
267
                toReturn = "-";
8✔
268

269
            //ignore element zero because element zero is always a duplicate see https://msdn.microsoft.com/en-us/library/system.text.regularexpressions.match.groups%28v=vs.110%29.aspx
270
            if (digits.Groups.Count == 3 && string.IsNullOrWhiteSpace(digits.Groups[2].Value))
16✔
271
            {
272
                for (var i = 0; i < Convert.ToInt32(digits.Groups[1].Value); i++)
120✔
273
                    toReturn += "9";
52✔
274

275
                return toReturn;
8✔
276
            }
277

278
            if (digits.Groups.Count == 3)
8!
279
            {
280
                var totalDigits = Convert.ToInt32(digits.Groups[1].Value);
8✔
281
                var digitsAfterDecimal = Convert.ToInt32(digits.Groups[2].Value);
8✔
282

283
                for (var i = 0; i < totalDigits + 1; i++)
88✔
284
                    if (i == totalDigits - digitsAfterDecimal)
36✔
285
                        toReturn += ".";
8✔
286
                    else
287
                        toReturn += "9";
28✔
288

289
                return toReturn;
8✔
290
            }
291
        }
292

293
        var valueType = GetDataType(datatype);
×
294

295
        return valueType switch
×
296
        {
×
297
            ValueType.CharacterString when min => "''",
×
298
            ValueType.CharacterString => throw new NotSupportedException(
×
299
                "Cannot think what the maximum character string would be, maybe use min = true instead?"),
×
300
            ValueType.DateTime when min => "'1753-1-1'",
×
301
            ValueType.DateTime => throw new NotSupportedException(
×
302
                "Cannot think what the maximum date would be, maybe use min = true instead?"),
×
303
            ValueType.Time => min ? "'00:00:00'" : "'23:59:59'",
×
304
            ValueType.Freaky => throw new NotSupportedException(
×
305
                $"Cannot predict null value substitution for freaky data types like {datatype}"),
×
306
            ValueType.Binary => throw new NotSupportedException(
×
307
                $"Cannot predict null value substitution for binary data types like {datatype}"),
×
308
            _ => throw new NotSupportedException($"Didn't know what minimum value type to use for {datatype}")
×
309
        };
×
310
    }
311

312
    private enum ValueType
313
    {
314
        Numeric,
315
        DateTime,
316
        Time,
317
        CharacterString,
318
        Binary,
319
        Freaky
320
    }
321
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc