• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

keplergl / kepler.gl / 13395431770

18 Feb 2025 04:29PM UTC coverage: 66.175% (-0.3%) from 66.434%
13395431770

push

github

web-flow
[feat] improvements to duckDB column type handling (#2970)

This PR preserves column types across the different ingestion paths between Kepler and DuckDB

- timestamps stored as strings from Arrow tables are recognized as timestamps. 
- apply extra metadata from table.schema.metadata (geoparquet files). 
- DuckDB geometry is automatically cast to WKB and properly marked with geoarrow extensions.
- DuckDB column types and query result Arrow table types consolidation.
- Apply extra logic only to the last select query.
- geoarrow constants to constants module
- add getSampleForTypeAnalyzeArrow to support and not fail for arrow data
- arrowSchemaToFields accepts extra info from DuckDB table schemas. JSON type gets GEOMETRY_FROM_STRING type, GEOMETRY with geoarrow metadata gets GEOMETRY type, timestamp ...
- fix in validateInputData - check analyzerType only for current field
- fix in validateInputData - support arrow input data

---------

Signed-off-by: Ihor Dykhta <dikhta.igor@gmail.com>

6024 of 10612 branches covered (56.77%)

Branch coverage included in aggregate %.

10 of 94 new or added lines in 8 files covered. (10.64%)

1 existing line in 1 file now uncovered.

12368 of 17181 relevant lines covered (71.99%)

88.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.56
/src/utils/src/dataset-utils.ts
1
// SPDX-License-Identifier: MIT
2
// Copyright contributors to the kepler.gl project
3

4
import * as arrow from 'apache-arrow';
5
import {
6
  ALL_FIELD_TYPES,
7
  FIELD_OPTS,
8
  TOOLTIP_FORMATS,
9
  TOOLTIP_FORMAT_TYPES
10
} from '@kepler.gl/constants';
11
import {
12
  getSampleForTypeAnalyze,
13
  getSampleForTypeAnalyzeArrow,
14
  getFieldsFromData
15
} from '@kepler.gl/common-utils';
16
import {Analyzer} from 'type-analyzer';
17
import assert from 'assert';
18

19
import {
20
  ProcessorResult,
21
  RGBColor,
22
  Field,
23
  FieldPair,
24
  TimeLabelFormat,
25
  TooltipFields,
26
  ProtoDataset
27
} from '@kepler.gl/types';
28
import {TooltipFormat} from '@kepler.gl/constants';
29
import {notNullorUndefined, h3IsValid} from '@kepler.gl/common-utils';
30

31
import {isPlainObject} from './utils';
32
import {getFormatter} from './data-utils';
33
import {getFormatValue} from './format';
34
import {hexToRgb} from './color-utils';
35

36
// apply a color for each dataset
37
// to use as label colors
38
const datasetColors = [
15✔
39
  '#8F2FBF',
40
  '#005CFF',
41
  '#C06C84',
42
  '#F8B195',
43
  '#547A82',
44
  '#3EACA8',
45
  '#A2D4AB'
46
].map(hexToRgb);
47

48
/**
49
 * Random color generator
50
 */
51
function* generateColor(): Generator<RGBColor> {
52
  let index = 0;
3✔
53
  while (index < datasetColors.length + 1) {
3✔
54
    if (index === datasetColors.length) {
102✔
55
      index = 0;
14✔
56
    }
57
    yield datasetColors[index++];
102✔
58
  }
59
}
60

61
export const datasetColorMaker = generateColor();
15✔
62

63
/**
64
 * Field name prefixes and suffixes which should not be considered
65
 * as metrics. Fields will still be included if a 'metric word'
66
 * is found on the field name, however.
67
 */
68
const EXCLUDED_DEFAULT_FIELDS = [
15✔
69
  // Serial numbers and identification numbers
70
  '_id',
71
  'id',
72
  'index',
73
  'uuid',
74
  'guid',
75
  'uid',
76
  'gid',
77
  'serial',
78
  // Geographic IDs are unlikely to be interesting to color
79
  'zip',
80
  'code',
81
  'post',
82
  'region',
83
  'fips',
84
  'cbgs',
85
  'h3',
86
  's2',
87
  // Geographic coords (but not z/elevation/altitude
88
  // since that might be a metric)
89
  'lat',
90
  'lon',
91
  'lng',
92
  'latitude',
93
  'longitude',
94
  '_x',
95
  '_y'
96
];
97

98
/**
99
 * Prefixes and suffixes that indicate a field is a metric.
100
 *
101
 * Note that these are in order of preference, first being
102
 * most preferred.
103
 */
104
const METRIC_DEFAULT_FIELDS = [
15✔
105
  'metric',
106
  'value',
107
  'sum',
108
  'count',
109
  'unique',
110
  'mean',
111
  'mode',
112
  'median',
113
  'max',
114
  'min',
115
  'deviation',
116
  'variance',
117
  'p99',
118
  'p95',
119
  'p75',
120
  'p50',
121
  'p25',
122
  'p05',
123
  // Abbreviations are less preferred
124
  'cnt',
125
  'val'
126
];
127

128
/**
129
 * Choose a field to use as the default color field of a layer.
130
 *
131
 * The heuristic is:
132
 *
133
 * First, exclude fields that are on the exclusion list and don't
134
 * have names that suggest they contain metrics. Also exclude
135
 * field names that are blank.
136
 *
137
 * Next, look for a field that is of real type and contains one
138
 * of the preferred names (in order of the preferred names).
139
 *
140
 * Next, look for a field that is of integer type and contains
141
 * one of the preferred names (in order of the preferred names).
142
 *
143
 * Next, look for the first field that is of real type (in order
144
 * of field index).
145
 *
146
 * Next, look for the first field that is of integer type (in
147
 * order of field index).
148
 *
149
 * It's possible no field will be chosen (i.e. because all fields
150
 * are strings.)
151
 *
152
 * @param dataset
153
 */
154
export function findDefaultColorField({
155
  fields,
156
  fieldPairs = []
×
157
}: {
158
  fields: Field[];
159
  fieldPairs: FieldPair[];
160
}): null | Field {
161
  const fieldsWithoutExcluded = fields.filter(field => {
61✔
162
    if (field.type !== ALL_FIELD_TYPES.real && field.type !== ALL_FIELD_TYPES.integer) {
497✔
163
      // Only select numeric fields.
164
      return false;
245✔
165
    }
166
    if (
252✔
167
      fieldPairs.find(
168
        pair => pair.pair.lat.value === field.name || pair.pair.lng.value === field.name
358✔
169
      )
170
    ) {
171
      // Do not permit lat, lon fields
172
      return false;
200✔
173
    }
174

175
    const normalizedFieldName = field.name.toLowerCase();
52✔
176
    if (normalizedFieldName === '') {
52!
177
      // Special case excluded name when the name is blank.
178
      return false;
×
179
    }
180
    const hasExcluded = EXCLUDED_DEFAULT_FIELDS.find(
52✔
181
      f => normalizedFieldName.startsWith(f) || normalizedFieldName.endsWith(f)
629✔
182
    );
183
    const hasInclusion = METRIC_DEFAULT_FIELDS.find(
52✔
184
      f => normalizedFieldName.startsWith(f) || normalizedFieldName.endsWith(f)
912✔
185
    );
186
    return !hasExcluded || hasInclusion;
52✔
187
  });
188

189
  const sortedFields = fieldsWithoutExcluded.sort((left, right) => {
61✔
190
    const normalizedLeft = left.name.toLowerCase();
10✔
191
    const normalizedRight = right.name.toLowerCase();
10✔
192
    const leftHasInclusion = METRIC_DEFAULT_FIELDS.findIndex(
10✔
193
      f => normalizedLeft.startsWith(f) || normalizedLeft.endsWith(f)
200✔
194
    );
195
    const rightHasInclusion = METRIC_DEFAULT_FIELDS.findIndex(
10✔
196
      f => normalizedRight.startsWith(f) || normalizedRight.endsWith(f)
72✔
197
    );
198
    if (leftHasInclusion !== rightHasInclusion) {
10✔
199
      if (leftHasInclusion === -1) {
8!
200
        // Elements that do not have the inclusion list should go after those that do.
201
        return 1;
8✔
202
      } else if (rightHasInclusion === -1) {
×
203
        // Elements that do have the inclusion list should go before those that don't.
204
        return -1;
×
205
      }
206
      // Compare based on order in the inclusion list
207
      return leftHasInclusion - rightHasInclusion;
×
208
    }
209

210
    // Compare based on type
211
    if (left.type !== right.type) {
2!
212
      if (left.type === ALL_FIELD_TYPES.real) {
×
213
        return -1;
×
214
      }
215
      // left is an integer and right is not
216
      // and reals come before integers
217
      return 1;
×
218
    }
219

220
    // Finally, order based on the order in the datasets columns
221
    // @ts-expect-error
222
    return left.index - right.index;
2✔
223
  });
224

225
  if (sortedFields.length) {
61✔
226
    // There was a best match
227
    return sortedFields[0];
15✔
228
  }
229
  // No matches
230
  return null;
46✔
231
}
232

233
/**
 * Validate input data, adding missing field types, rename duplicate columns.
 *
 * Returns null when `data` is not a plain object or lacks `fields`/`rows`
 * arrays. When every field passes validation, the input is returned as-is.
 * Otherwise field types/formats/analyzerTypes are recomputed for ALL fields
 * from a data sample (rows path or arrow `cols` path).
 *
 * NOTE(review): `assert` from node's 'assert' only throws on a FALSY value —
 * every `assert('...message...')` call below passes a non-empty string and so
 * never throws. These appear to be dev-time diagnostics; confirm intent.
 */
export function validateInputData(data: ProtoDataset['data']): ProcessorResult {
  if (!isPlainObject(data)) {
    assert('addDataToMap Error: dataset.data cannot be null');
    return null;
  } else if (!Array.isArray(data.fields)) {
    assert('addDataToMap Error: expect dataset.data.fields to be an array');
    return null;
  } else if (!Array.isArray(data.rows)) {
    assert('addDataToMap Error: expect dataset.data.rows to be an array');
    return null;
  }

  // `cols` is optional: present for arrow-backed datasets (array of vectors),
  // absent for plain row-major data.
  const {fields, rows, cols} = data;

  // Check that every field has a usable name, type, and analyzerType.
  // NOTE: this callback MUTATES `fields` in place (replaces non-object
  // entries, assigns missing names) while iterating.
  const allValid = fields.every((f, i) => {
    if (!isPlainObject(f)) {
      assert(`fields needs to be an array of object, but find ${typeof f}`);
      fields[i] = {name: `column_${i}`, type: ALL_FIELD_TYPES.string};
    }

    if (!f.name) {
      assert(`field.name is required but missing in ${JSON.stringify(f)}`);
      // assign a name
      fields[i].name = `column_${i}`;
    }

    // Unknown type or missing analyzerType invalidates the whole field set
    // and triggers the recompute path below.
    if (!f.type || !ALL_FIELD_TYPES[f.type]) {
      assert(`unknown field type ${f.type}`);
      return false;
    }

    if (!f.analyzerType) {
      assert(`field ${i} missing analyzerType`);
      return false;
    }

    // check time format is correct based on first 10 not empty element
    if (f.type === ALL_FIELD_TYPES.timestamp) {
      const sample = (
        cols ? findNonEmptyRowsAtFieldArrow(cols, i, 10) : findNonEmptyRowsAtField(rows, i, 10)
      ).map(r => ({ts: r[i]}));
      const analyzedType = Analyzer.computeColMeta(sample)[0];
      // Field is valid only when the analyzer agrees it is a TIME column
      // with exactly the declared format.
      return analyzedType && analyzedType.category === 'TIME' && analyzedType.format === f.format;
    }

    // A declared string field must NOT actually contain H3 indices;
    // if it does, force a type recompute.
    if (f.type === ALL_FIELD_TYPES.string) {
      const sample = (
        cols ? findNonEmptyRowsAtFieldArrow(cols, i, 10) : findNonEmptyRowsAtField(rows, i, 10)
      ).map(r => r[i]);
      return sample.every(item => !h3IsValid(item));
    }

    return true;
  });

  if (allValid) {
    // NOTE(review): this always includes the `cols` key (possibly undefined),
    // while the recompute path below conditionally spreads it — confirm
    // whether callers distinguish `cols: undefined` from an absent key.
    return {rows, fields, cols};
  }

  // if any field has missing type, recalculate it for everyone
  // because we simply lost faith in humanity
  const sampleData = cols
    ? getSampleForTypeAnalyzeArrow(
        cols,
        fields.map(f => f.name)
      )
    : getSampleForTypeAnalyze({
        fields: fields.map(f => f.name),
        rows
      });
  const fieldOrder = fields.map(f => f.name);
  const meta = getFieldsFromData(sampleData, fieldOrder);
  // Keep all original field properties, overriding only the recomputed ones.
  const updatedFields = fields.map((f, i) => ({
    ...f,
    type: meta[i].type,
    format: meta[i].format,
    analyzerType: meta[i].analyzerType
  }));

  return {fields: updatedFields, rows, ...(cols ? {cols} : {})};
}
319

320
function findNonEmptyRowsAtField(rows: unknown[][], fieldIdx: number, total: number): any[] {
321
  const sample: any[] = [];
392✔
322
  let i = 0;
392✔
323
  while (sample.length < total && i < rows.length) {
392✔
324
    if (notNullorUndefined(rows[i]?.[fieldIdx])) {
4,523✔
325
      sample.push(rows[i]);
3,515✔
326
    }
327
    i++;
4,523✔
328
  }
329
  return sample;
392✔
330
}
331

332
function findNonEmptyRowsAtFieldArrow(
333
  cols: arrow.Vector[],
334
  fieldIdx: number,
335
  total: number
336
): any[] {
NEW
337
  const sample: any[] = [];
×
NEW
338
  const numRows = cols[fieldIdx].length;
×
NEW
339
  let i = 0;
×
NEW
340
  while (sample.length < total && i < numRows) {
×
NEW
341
    if (notNullorUndefined(cols[fieldIdx].get(i))) {
×
NEW
342
      const row = cols.map(col => col.get(i));
×
NEW
343
      sample.push(row);
×
344
    }
NEW
345
    i++;
×
346
  }
NEW
347
  return sample;
×
348
}
349

350
const TIME_DISPLAY = '2020-05-11 14:00';
15✔
351

352
export const addTimeLabel = (formats: TimeLabelFormat[]) =>
15✔
353
  formats.map(f => ({
1,548✔
354
    ...f,
355
    label:
356
      f.type === TOOLTIP_FORMAT_TYPES.DATE_TIME || f.type === TOOLTIP_FORMAT_TYPES.DATE
4,392✔
357
        ? getFormatter(getFormatValue(f))(TIME_DISPLAY)
358
        : f.label
359
  }));
360

361
export function getFieldFormatLabels(fieldType?: string): TooltipFormat[] {
362
  const tooltipTypes = (fieldType && FIELD_OPTS[fieldType].format.tooltip) || [];
138!
363
  const formatLabels: TimeLabelFormat[] = Object.values(TOOLTIP_FORMATS).filter(t =>
138✔
364
    tooltipTypes.includes(t.type)
4,278✔
365
  );
366
  return addTimeLabel(formatLabels);
138✔
367
}
368

369
export const getFormatLabels = (fields: TooltipFields[], fieldName: string): TooltipFormat[] => {
15✔
370
  const fieldType = fields.find(f => f.name === fieldName)?.type;
154✔
371
  return getFieldFormatLabels(fieldType);
37✔
372
};
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc