• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

keplergl / kepler.gl / 13395431770

18 Feb 2025 04:29PM UTC coverage: 66.175% (-0.3%) from 66.434%
13395431770

push

github

web-flow
[feat] improvements to duckDB column type handling (#2970)

This PR intends to preserve column types between different types of ingestion into Kepler and DuckDb

- timestamps stored as strings from Arrow tables are recognized as timestamps. 
- apply extra metadata from table.schema.metadata (geoparquet files). 
- DuckDB geometry is automatically casted to WKB, and properly marked with geoarrow extensions.
- DuckDB column types and query result Arrow table types consolidation.
- Apply extra logic only to the last select query.
- geoarrow constants to constants module
- add getSampleForTypeAnalyzeArrow to support and not fail for arrow data
- arrowSchemaToFields accepts extra info from DuckDB table schemas. JSON type gets GEOMETRY_FROM_STRING type, GEOMETRY with geoarrow metadata gets GEOMETRY type, timestamp ...
- fix in validateInputData - check analyzerType only for current field
- fix in validateInputData - support arrow input data

---------

Signed-off-by: Ihor Dykhta <dikhta.igor@gmail.com>

6024 of 10612 branches covered (56.77%)

Branch coverage included in aggregate %.

10 of 94 new or added lines in 8 files covered. (10.64%)

1 existing line in 1 file now uncovered.

12368 of 17181 relevant lines covered (71.99%)

88.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.79
/src/common-utils/src/data-type.ts
1
// SPDX-License-Identifier: MIT
2
// Copyright contributors to the kepler.gl project
3

4
import {Analyzer, DATA_TYPES as AnalyzerDATA_TYPES} from 'type-analyzer';
5
import {ArrowTableInterface, ApacheVectorInterface, RowData, Field} from '@kepler.gl/types';
6
import {ALL_FIELD_TYPES} from '@kepler.gl/constants';
7
import {console as globalConsole} from 'global/window';
8
import {range} from 'd3-array';
9
import {isHexWkb, notNullorUndefined} from './data';
10
import {h3IsValid} from './h3-utils';
11

12
const H3_ANALYZER_TYPE = 'H3';
15✔
13

14
export const ACCEPTED_ANALYZER_TYPES = [
15✔
15
  AnalyzerDATA_TYPES.DATE,
16
  AnalyzerDATA_TYPES.TIME,
17
  AnalyzerDATA_TYPES.DATETIME,
18
  AnalyzerDATA_TYPES.NUMBER,
19
  AnalyzerDATA_TYPES.INT,
20
  AnalyzerDATA_TYPES.FLOAT,
21
  AnalyzerDATA_TYPES.BOOLEAN,
22
  AnalyzerDATA_TYPES.STRING,
23
  AnalyzerDATA_TYPES.GEOMETRY,
24
  AnalyzerDATA_TYPES.GEOMETRY_FROM_STRING,
25
  AnalyzerDATA_TYPES.PAIR_GEOMETRY_FROM_STRING,
26
  AnalyzerDATA_TYPES.ZIPCODE,
27
  AnalyzerDATA_TYPES.ARRAY,
28
  AnalyzerDATA_TYPES.OBJECT,
29
  H3_ANALYZER_TYPE
30
];
31

32
const IGNORE_DATA_TYPES = Object.keys(AnalyzerDATA_TYPES).filter(
15✔
33
  type => !ACCEPTED_ANALYZER_TYPES.includes(type)
270✔
34
);
35

36
/**
37
 * Getting sample data for analyzing field type.
38
 */
39
export function getSampleForTypeAnalyze({
40
  fields,
41
  rows,
42
  sampleCount = 50
118✔
43
}: {
44
  fields: string[];
45
  rows: unknown[][] | RowData;
46
  sampleCount?: number;
47
}): RowData {
48
  const total = Math.min(sampleCount, rows.length);
119✔
49
  // const fieldOrder = fields.map(f => f.name);
50
  const sample = range(0, total, 1).map(() => ({}));
1,362✔
51

52
  if (rows.length < 1) {
119!
53
    return [];
×
54
  }
55
  const isRowObject = !Array.isArray(rows[0]);
119✔
56

57
  // collect sample data for each field
58
  fields.forEach((field, fieldIdx) => {
119✔
59
    // row counter
60
    let i = 0;
947✔
61
    // sample counter
62
    let j = 0;
947✔
63

64
    while (j < total) {
947✔
65
      if (i >= rows.length) {
15,078✔
66
        // if depleted data pool
67
        sample[j][field] = null;
1,000✔
68
        j++;
1,000✔
69
      } else if (notNullorUndefined(rows[i][isRowObject ? field : fieldIdx])) {
14,078!
70
        const value = rows[i][isRowObject ? field : fieldIdx];
13,073!
71
        sample[j][field] = typeof value === 'string' ? value.trim() : value;
13,073✔
72
        j++;
13,073✔
73
        i++;
13,073✔
74
      } else {
75
        i++;
1,005✔
76
      }
77
    }
78
  });
79

80
  return sample;
119✔
81
}
82

83
/**
84
 * Getting sample data for analyzing field type for Arrow tables.
85
 * @param table Arrow table or an array of vectors.
86
 * @param fields Field names.
87
 * @param sampleCount Number of sample rows to get.
88
 * @returns Sample rows.
89
 */
90
export function getSampleForTypeAnalyzeArrow(
91
  table: ArrowTableInterface | ApacheVectorInterface[],
92
  fields: string[],
93
  sampleCount: number = 50
×
94
): any[] {
NEW
95
  const isTable = !Array.isArray(table);
×
96

NEW
97
  const numRows = isTable ? table.numRows : table[0].length;
×
NEW
98
  const getVector = isTable ? index => table.getChildAt(index) : index => table[index];
×
99

NEW
100
  const total = Math.min(sampleCount, numRows);
×
NEW
101
  const sample = range(0, total, 1).map(() => ({}));
×
102

NEW
103
  if (numRows < 1) {
×
NEW
104
    return [];
×
105
  }
106

107
  // collect sample data for each field
NEW
108
  fields.forEach((field, fieldIdx) => {
×
NEW
109
    let rowIndex = 0;
×
NEW
110
    let sampleIndex = 0;
×
111

NEW
112
    while (sampleIndex < total) {
×
NEW
113
      if (rowIndex >= numRows) {
×
114
        // if depleted data pool
NEW
115
        sample[sampleIndex][field] = null;
×
NEW
116
        sampleIndex++;
×
NEW
117
      } else if (notNullorUndefined(getVector(fieldIdx)?.get(rowIndex))) {
×
NEW
118
        const value = getVector(fieldIdx)?.get(rowIndex);
×
NEW
119
        sample[sampleIndex][field] = typeof value === 'string' ? value.trim() : value;
×
NEW
120
        sampleIndex++;
×
NEW
121
        rowIndex++;
×
122
      } else {
NEW
123
        rowIndex++;
×
124
      }
125
    }
126
  });
127

NEW
128
  return sample;
×
129
}
130

131
/**
132
 * Convert type-analyzer output to kepler.gl field types
133
 *
134
 * @param aType
135
 * @returns corresponding type in `ALL_FIELD_TYPES`
136
 */
137
/* eslint-disable complexity */
138
export function analyzerTypeToFieldType(aType: string): string {
139
  const {
140
    DATE,
141
    TIME,
142
    DATETIME,
143
    NUMBER,
144
    INT,
145
    FLOAT,
146
    BOOLEAN,
147
    STRING,
148
    GEOMETRY,
149
    GEOMETRY_FROM_STRING,
150
    PAIR_GEOMETRY_FROM_STRING,
151
    ZIPCODE,
152
    ARRAY,
153
    OBJECT
154
  } = AnalyzerDATA_TYPES;
973✔
155

156
  // TODO: un recognized types
157
  // CURRENCY PERCENT NONE
158
  switch (aType) {
973✔
159
    case DATE:
160
      return ALL_FIELD_TYPES.date;
19✔
161
    case TIME:
162
    case DATETIME:
163
      return ALL_FIELD_TYPES.timestamp;
133✔
164
    case FLOAT:
165
      return ALL_FIELD_TYPES.real;
332✔
166
    case INT:
167
      return ALL_FIELD_TYPES.integer;
193✔
168
    case BOOLEAN:
169
      return ALL_FIELD_TYPES.boolean;
48✔
170
    case GEOMETRY:
171
    case GEOMETRY_FROM_STRING:
172
    case PAIR_GEOMETRY_FROM_STRING:
173
      return ALL_FIELD_TYPES.geojson;
70✔
174
    case ARRAY:
175
      return ALL_FIELD_TYPES.array;
21✔
176
    case OBJECT:
177
      return ALL_FIELD_TYPES.object;
10✔
178
    case NUMBER:
179
    case STRING:
180
    case ZIPCODE:
181
      return ALL_FIELD_TYPES.string;
127✔
182
    case H3_ANALYZER_TYPE:
183
      return ALL_FIELD_TYPES.h3;
16✔
184
    default:
185
      globalConsole.warn(`Unsupported analyzer type: ${aType}`);
4✔
186
      return ALL_FIELD_TYPES.string;
4✔
187
  }
188
}
189

190
/**
191
 * Analyze field types from data in `string` format, e.g. uploaded csv.
192
 * Assign `type`, `fieldIdx` and `format` (timestamp only) to each field
193
 *
194
 * @param data array of row object
195
 * @param fieldOrder array of field names as string
196
 * @returns formatted fields
197
 * @public
198
 * @example
199
 *
200
 * import {getFieldsFromData} from 'kepler.gl/common-utils';
201
 * const data = [{
202
 *   time: '2016-09-17 00:09:55',
203
 *   value: '4',
204
 *   surge: '1.2',
205
 *   isTrip: 'true',
206
 *   zeroOnes: '0'
207
 * }, {
208
 *   time: '2016-09-17 00:30:08',
209
 *   value: '3',
210
 *   surge: null,
211
 *   isTrip: 'false',
212
 *   zeroOnes: '1'
213
 * }, {
214
 *   time: null,
215
 *   value: '2',
216
 *   surge: '1.3',
217
 *   isTrip: null,
218
 *   zeroOnes: '1'
219
 * }];
220
 *
221
 * const fieldOrder = ['time', 'value', 'surge', 'isTrip', 'zeroOnes'];
222
 * const fields = getFieldsFromData(data, fieldOrder);
223
 * // fields = [
224
 * // {name: 'time', format: 'YYYY-M-D H:m:s', fieldIdx: 1, type: 'timestamp'},
225
 * // {name: 'value', format: '', fieldIdx: 4, type: 'integer'},
226
 * // {name: 'surge', format: '', fieldIdx: 5, type: 'real'},
227
 * // {name: 'isTrip', format: '', fieldIdx: 6, type: 'boolean'},
228
 * // {name: 'zeroOnes', format: '', fieldIdx: 7, type: 'integer'}];
229
 *
230
 */
231
export function getFieldsFromData(data: RowData, fieldOrder: string[]): Field[] {
232
  // add a check for epoch timestamp
233
  const metadata = Analyzer.computeColMeta(
119✔
234
    data,
235
    [
236
      {regex: /.*geojson|all_points/g, dataType: 'GEOMETRY'},
237
      {regex: /.*census/g, dataType: 'STRING'}
238
    ],
239
    {ignoredDataTypes: IGNORE_DATA_TYPES}
240
  );
241

242
  const {fieldByIndex} = renameDuplicateFields(fieldOrder);
119✔
243

244
  const result = fieldOrder.map((field, index) => {
119✔
245
    const name = fieldByIndex[index];
955✔
246

247
    const fieldMeta = metadata.find(m => m.key === field);
5,416✔
248

249
    // fieldMeta could be undefined if the field has no data and Analyzer.computeColMeta
250
    // will ignore the field. In this case, we will simply assign the field type to STRING
251
    // since dropping the column in the RowData could be expensive
252
    let type = fieldMeta?.type || 'STRING';
955✔
253
    const format = fieldMeta?.format || '';
955✔
254

255
    // quick check if first valid string in column is H3
256
    if (type === AnalyzerDATA_TYPES.STRING) {
955✔
257
      for (let i = 0, n = data.length; i < n; ++i) {
141✔
258
        if (notNullorUndefined(data[i][name])) {
146✔
259
          type = h3IsValid(data[i][name] || '') ? H3_ANALYZER_TYPE : type;
138!
260
          break;
138✔
261
        }
262
      }
263
    }
264

265
    // quick check if string is hex wkb
266
    if (type === AnalyzerDATA_TYPES.STRING) {
955✔
267
      type = data.some(d => isHexWkb(d[name])) ? AnalyzerDATA_TYPES.GEOMETRY : type;
1,472✔
268
    }
269

270
    return {
955✔
271
      name,
272
      id: name,
273
      displayName: name,
274
      format,
275
      fieldIdx: index,
276
      type: analyzerTypeToFieldType(type),
277
      analyzerType: type,
278
      valueAccessor: dc => d => {
17✔
279
        return dc.valueAt(d.index, index);
×
280
      }
281
    };
282
  });
283

284
  return result;
119✔
285
}
286

287
/**
288
 * pass in an array of field names, rename duplicated one
289
 * and return a map from old field index to new name
290
 *
291
 * @param fieldOrder
292
 * @returns new field name by index
293
 */
294
export function renameDuplicateFields(fieldOrder: string[]): {
295
  allNames: string[];
296
  fieldByIndex: string[];
297
} {
298
  return fieldOrder.reduce<{allNames: string[]; fieldByIndex: string[]}>(
119✔
299
    (accu, field, i) => {
300
      const {allNames} = accu;
955✔
301
      let fieldName = field;
955✔
302

303
      // add a counter to duplicated names
304
      if (allNames.includes(field)) {
955✔
305
        let counter = 0;
2✔
306
        while (allNames.includes(`${field}-${counter}`)) {
2✔
307
          counter++;
1✔
308
        }
309
        fieldName = `${field}-${counter}`;
2✔
310
      }
311

312
      accu.fieldByIndex[i] = fieldName;
955✔
313
      accu.allNames.push(fieldName);
955✔
314

315
      return accu;
955✔
316
    },
317
    {allNames: [], fieldByIndex: []}
318
  );
319
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc