keplergl / kepler.gl · build 13395431770

18 Feb 2025 04:29PM UTC · coverage: 66.175% (-0.3%) from 66.434%

push · github · web-flow

[feat] improvements to duckDB column type handling (#2970)

This PR preserves column types across the different ways data is ingested into Kepler and DuckDB:

- Timestamps stored as strings in Arrow tables are recognized as timestamps.
- Extra metadata from table.schema.metadata is applied (GeoParquet files).
- DuckDB geometry is automatically cast to WKB and properly marked with GeoArrow extensions (see the sketch after the sign-off below).
- DuckDB column types and query-result Arrow table types are consolidated.
- The extra logic is applied only to the last SELECT query.
- GeoArrow constants moved to the constants module.
- Added getSampleForTypeAnalyzeArrow so type analysis supports Arrow data instead of failing on it.
- arrowSchemaToFields accepts extra info from DuckDB table schemas: the JSON type gets the GEOMETRY_FROM_STRING type, GEOMETRY with geoarrow metadata gets the GEOMETRY type, timestamp ...
- Fix in validateInputData: check analyzerType only for the current field.
- Fix in validateInputData: support Arrow input data.

---------

Signed-off-by: Ihor Dykhta <dikhta.igor@gmail.com>
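A minimal sketch of the WKB marking described above, assuming GEOARROW_METADATA_KEY resolves to Arrow's extension-name metadata key 'ARROW:extension:name' and GEOARROW_EXTENSIONS.WKB to 'geoarrow.wkb'; the column here is hypothetical:

import * as arrow from 'apache-arrow';

// Assumed values; the real constants live in @kepler.gl/constants.
const GEOARROW_METADATA_KEY = 'ARROW:extension:name';
const GEOARROW_WKB = 'geoarrow.wkb';

// A hypothetical BLOB column returned by DuckDB: WKB bytes, no field metadata.
const geomField = new arrow.Field('geom', new arrow.Binary(), true, new Map());

// Once the bytes are recognized as WKB, tag the field with the GeoArrow
// extension name so downstream consumers treat it as geometry.
geomField.metadata.set(GEOARROW_METADATA_KEY, GEOARROW_WKB);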

6024 of 10612 branches covered (56.77%)

Branch coverage included in aggregate %.

10 of 94 new or added lines in 8 files covered. (10.64%)

1 existing line in 1 file now uncovered.

12368 of 17181 relevant lines covered (71.99%)

88.21 hits per line

Source File: /src/processors/src/data-processor.ts (55.08% covered)
// SPDX-License-Identifier: MIT
// Copyright contributors to the kepler.gl project

import * as arrow from 'apache-arrow';
import {csvParseRows} from 'd3-dsv';
import {DATA_TYPES as AnalyzerDATA_TYPES} from 'type-analyzer';
import normalize from '@mapbox/geojson-normalize';
import {parseSync} from '@loaders.gl/core';
import {ArrowTable} from '@loaders.gl/schema';
import {WKBLoader} from '@loaders.gl/wkt';

import {
  ALL_FIELD_TYPES,
  DATASET_FORMATS,
  GEOARROW_EXTENSIONS,
  GEOARROW_METADATA_KEY,
  GUIDES_FILE_FORMAT_DOC
} from '@kepler.gl/constants';
import {ProcessorResult, Field} from '@kepler.gl/types';
import {
  arrowDataTypeToAnalyzerDataType,
  arrowDataTypeToFieldType,
  hasOwnProperty,
  isPlainObject
} from '@kepler.gl/utils';
import {
  analyzerTypeToFieldType,
  getSampleForTypeAnalyze,
  getSampleForTypeAnalyzeArrow,
  getFieldsFromData,
  h3IsValid,
  notNullorUndefined,
  toArray
} from '@kepler.gl/common-utils';
import {KeplerGlSchema, ParsedDataset, SavedMap, LoadedMap} from '@kepler.gl/schemas';
import {Feature} from '@nebula.gl/edit-modes';

// if any of these values occurs in csv, parse it to null;
// const CSV_NULLS = ['', 'null', 'NULL', 'Null', 'NaN', '/N'];
// also matches the empty string
export const CSV_NULLS = /^(null|NULL|Null|NaN|\/N||)$/;

function tryParseJsonString(str) {
  try {
    return JSON.parse(str);
  } catch (e) {
    return null;
  }
}

export const PARSE_FIELD_VALUE_FROM_STRING = {
  [ALL_FIELD_TYPES.boolean]: {
    valid: (d: unknown): boolean => typeof d === 'boolean',
    parse: (d: unknown): boolean => d === 'true' || d === 'True' || d === 'TRUE' || d === '1'
  },
  [ALL_FIELD_TYPES.integer]: {
    // @ts-ignore
    valid: (d: unknown): boolean => parseInt(d, 10) === d,
    // @ts-ignore
    parse: (d: unknown): number => parseInt(d, 10)
  },
  [ALL_FIELD_TYPES.timestamp]: {
    valid: (d: unknown, field: Field): boolean =>
      ['x', 'X'].includes(field.format) ? typeof d === 'number' : typeof d === 'string',
    parse: (d: any, field: Field) => (['x', 'X'].includes(field.format) ? Number(d) : d)
  },
  [ALL_FIELD_TYPES.real]: {
    // @ts-ignore
    valid: (d: unknown): boolean => parseFloat(d) === d,
    // Note: this will result in NaN for some strings
    parse: parseFloat
  },
  [ALL_FIELD_TYPES.object]: {
    valid: isPlainObject,
    parse: tryParseJsonString
  },

  [ALL_FIELD_TYPES.array]: {
    valid: Array.isArray,
    parse: tryParseJsonString
  },

  [ALL_FIELD_TYPES.h3]: {
    valid: d => h3IsValid(d),
    parse: d => d
  }
};
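/*
 * Example (hypothetical values): each entry pairs a validity check with a parser:
 *   PARSE_FIELD_VALUE_FROM_STRING[ALL_FIELD_TYPES.boolean].parse('True'); // true
 *   PARSE_FIELD_VALUE_FROM_STRING[ALL_FIELD_TYPES.integer].parse('42'); // 42
 *   PARSE_FIELD_VALUE_FROM_STRING[ALL_FIELD_TYPES.integer].valid(42); // true
 */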

/**
 * Process csv data, output a data object with `{fields: [], rows: []}`.
 * The data object can be wrapped in a `dataset` and passed to [`addDataToMap`](../actions/actions.md#adddatatomap)
 * @param rawData raw csv string
 * @returns data object `{fields: [], rows: []}` that can be passed to addDataToMap
 * @public
 * @example
 * import {processCsvData} from 'kepler.gl/processors';
 *
 * const testData = `gps_data.utc_timestamp,gps_data.lat,gps_data.lng,gps_data.types,epoch,has_result,id,time,begintrip_ts_utc,begintrip_ts_local,date
 * 2016-09-17 00:09:55,29.9900937,31.2590542,driver_analytics,1472688000000,False,1,2016-09-23T00:00:00.000Z,2016-10-01 09:41:39+00:00,2016-10-01 09:41:39+00:00,2016-09-23
 * 2016-09-17 00:10:56,29.9927699,31.2461142,driver_analytics,1472688000000,False,2,2016-09-23T00:00:00.000Z,2016-10-01 09:46:37+00:00,2016-10-01 16:46:37+00:00,2016-09-23
 * 2016-09-17 00:11:56,29.9907261,31.2312742,driver_analytics,1472688000000,False,3,2016-09-23T00:00:00.000Z,,,2016-09-23
 * 2016-09-17 00:12:58,29.9870074,31.2175827,driver_analytics,1472688000000,False,4,2016-09-23T00:00:00.000Z,,,2016-09-23`
 *
 * const dataset = {
 *  info: {id: 'test_data', label: 'My Csv'},
 *  data: processCsvData(testData)
 * };
 *
 * dispatch(addDataToMap({
 *  datasets: [dataset],
 *  options: {centerMap: true, readOnly: true}
 * }));
 */
export function processCsvData(rawData: unknown[][] | string, header?: string[]): ProcessorResult {
  let rows: unknown[][] | undefined;
  let headerRow: string[] | undefined;

  if (typeof rawData === 'string') {
    const parsedRows: string[][] = csvParseRows(rawData);

    if (!Array.isArray(parsedRows) || parsedRows.length < 2) {
      // looks like an empty file; throw an error to be caught
      throw new Error('process Csv Data Failed: CSV is empty');
    }
    headerRow = parsedRows[0];
    rows = parsedRows.slice(1);
  } else if (Array.isArray(rawData) && rawData.length) {
    rows = rawData;
    headerRow = header;

    if (!Array.isArray(headerRow)) {
      // if data is passed in as an array of rows and the header is missing,
      // assume the first row is the header
      // @ts-ignore
      headerRow = rawData[0];
      rows = rawData.slice(1);
    }
  }

  if (!rows || !headerRow) {
    throw new Error('invalid input passed to processCsvData');
  }

  // here we assume the csv file that people uploaded has the column names
  // in its first row

  cleanUpFalsyCsvValue(rows);
  // No need to run type detection on every data point:
  // here we get a list of non-null values to run the analyzer on
  const sample = getSampleForTypeAnalyze({fields: headerRow, rows});
  const fields = getFieldsFromData(sample, headerRow);
  const parsedRows = parseRowsByFields(rows, fields);

  return {fields, rows: parsedRows};
}

/**
 * Parse rows of csv by analyzed field types, so that `'1'` -> `1`, `'True'` -> `true`
 * @param rows
 * @param fields
 */
export function parseRowsByFields(rows: any[][], fields: Field[]) {
  // Edit rows in place
  const geojsonFieldIdx = fields.findIndex(f => f.name === '_geojson');
  fields.forEach(parseCsvRowsByFieldType.bind(null, rows, geojsonFieldIdx));

  return rows;
}
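/*
 * Example (hypothetical data): with fields analyzed as integer and boolean
 * (e.g. via getFieldsFromData), string cells are converted in place:
 *   parseRowsByFields([['1', 'true'], ['2', 'false']], fields);
 *   // -> [[1, true], [2, false]]
 */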

/**
 * Convert falsy values in csv, including `'', 'null', 'NULL', 'Null', 'NaN'`, to `null`,
 * so that type-analyzer won't detect them as strings
 *
 * @param rows
 */
function cleanUpFalsyCsvValue(rows: unknown[][]): void {
  const re = new RegExp(CSV_NULLS, 'g');
  for (let i = 0; i < rows.length; i++) {
    for (let j = 0; j < rows[i].length; j++) {
      // the analyzer will set a field to 'string' if there are empty values,
      // which d3.csv parses as ''
      // here we parse empty data as null
      // TODO: create a warning when `CSV_NULLS` are detected in the data
      if (typeof rows[i][j] === 'string' && (rows[i][j] as string).match(re)) {
        rows[i][j] = null;
      }
    }
  }
}
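/*
 * Example (hypothetical data):
 *   const rows = [['a', ''], ['NULL', '1.2'], ['NaN', '/N']];
 *   cleanUpFalsyCsvValue(rows);
 *   // rows is now [['a', null], [null, '1.2'], [null, null]]
 */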

/**
 * Process uploaded csv file to parse values by field type
 *
 * @param rows
 * @param geoFieldIdx field index
 * @param field
 * @param i
 */
export function parseCsvRowsByFieldType(
  rows: unknown[][],
  geoFieldIdx: number,
  field: Field,
  i: number
): void {
  const parser = PARSE_FIELD_VALUE_FROM_STRING[field.type];
  if (parser) {
    // check the first non-null value to see if it's already parsed
    const first = rows.find(r => notNullorUndefined(r[i]));
    if (!first || parser.valid(first[i], field)) {
      return;
    }
    rows.forEach(row => {
      // parse string value based on field type
      if (row[i] !== null) {
        row[i] = parser.parse(row[i], field);
        if (
          geoFieldIdx > -1 &&
          isPlainObject(row[geoFieldIdx]) &&
          // @ts-ignore
          hasOwnProperty(row[geoFieldIdx], 'properties')
        ) {
          // @ts-ignore
          row[geoFieldIdx].properties[field.name] = row[i];
        }
      }
    });
  }
}

/* eslint-enable complexity */

/**
 * Process data where each row is an object, output can be passed to [`addDataToMap`](../actions/actions.md#adddatatomap)
 * NOTE: This function may mutate input.
 * @param rawData an array of row objects, each object should have the same number of keys
 * @returns dataset containing `fields` and `rows`
 * @public
 * @example
 * import {addDataToMap} from 'kepler.gl/actions';
 * import {processRowObject} from 'kepler.gl/processors';
 *
 * const data = [
 *  {lat: 31.27, lng: 127.56, value: 3},
 *  {lat: 31.22, lng: 126.26, value: 1}
 * ];
 *
 * dispatch(addDataToMap({
 *  datasets: {
 *    info: {label: 'My Data', id: 'my_data'},
 *    data: processRowObject(data)
 *  }
 * }));
 */
export function processRowObject(rawData: unknown[]): ProcessorResult {
  if (!Array.isArray(rawData)) {
    return null;
  } else if (!rawData.length) {
    // data is empty
    return {
      fields: [],
      rows: []
    };
  }

  const keys = Object.keys(rawData[0]); // [lat, lng, value]
  const rows = rawData.map(d => keys.map(key => d[key])); // [[31.27, 127.56, 3]]

  // row objects can still contain values like `Null` or `N/A`
  cleanUpFalsyCsvValue(rows);

  return processCsvData(rows, keys);
}

/**
 * Process GeoJSON [`FeatureCollection`](http://wiki.geojson.org/GeoJSON_draft_version_6#FeatureCollection),
 * output a data object with `{fields: [], rows: []}`.
 * The data object can be wrapped in a `dataset` and passed to [`addDataToMap`](../actions/actions.md#adddatatomap)
 * NOTE: This function may mutate input.
 *
 * @param rawData raw geojson feature collection
 * @returns dataset containing `fields` and `rows`
 * @public
 * @example
 * import {addDataToMap} from 'kepler.gl/actions';
 * import {processGeojson} from 'kepler.gl/processors';
 *
 * const geojson = {
 *         "type" : "FeatureCollection",
 *         "features" : [{
 *                 "type" : "Feature",
 *                 "properties" : {
 *                         "capacity" : "10",
 *                         "type" : "U-Rack"
 *                 },
 *                 "geometry" : {
 *                         "type" : "Point",
 *                         "coordinates" : [ -71.073283, 42.417500 ]
 *                 }
 *         }]
 * };
 *
 * dispatch(addDataToMap({
 *  datasets: {
 *    info: {
 *      label: 'Sample Taxi Trips in New York City',
 *      id: 'test_trip_data'
 *    },
 *    data: processGeojson(geojson)
 *  }
 * }));
 */
export function processGeojson(rawData: unknown): ProcessorResult {
  const normalizedGeojson = normalize(rawData);

  if (!normalizedGeojson || !Array.isArray(normalizedGeojson.features)) {
    throw new Error(
      `Read File Failed: File is not a valid GeoJSON. Read more about [supported file format](${GUIDES_FILE_FORMAT_DOC})`
    );
  }

  // getting all feature fields
  const allDataRows: Array<{_geojson: Feature} & keyof Feature> = [];
  for (let i = 0; i < normalizedGeojson.features.length; i++) {
    const f = normalizedGeojson.features[i];
    if (f.geometry) {
      allDataRows.push({
        // add feature to _geojson field
        _geojson: f,
        ...(f.properties || {})
      });
    }
  }
  // get all the fields
  const fields = allDataRows.reduce<string[]>((accu, curr) => {
    Object.keys(curr).forEach(key => {
      if (!accu.includes(key)) {
        accu.push(key);
      }
    });
    return accu;
  }, []);

  // make sure each feature has the exact same fields
  allDataRows.forEach(d => {
    fields.forEach(f => {
      if (!(f in d)) {
        d[f] = null;
        if (d._geojson.properties) {
          d._geojson.properties[f] = null;
        }
      }
    });
  });

  return processRowObject(allDataRows);
}

/**
 * Process saved kepler.gl json to be passed to [`addDataToMap`](../actions/actions.md#adddatatomap).
 * The json object should contain `datasets` and `config`.
 * @param rawData
 * @param schema
 * @returns datasets and config `{datasets: {}, config: {}}`
 * @public
 * @example
 * import {addDataToMap} from 'kepler.gl/actions';
 * import {processKeplerglJSON} from 'kepler.gl/processors';
 *
 * dispatch(addDataToMap(processKeplerglJSON(keplerGlJson)));
 */
export function processKeplerglJSON(rawData: SavedMap, schema = KeplerGlSchema): LoadedMap | null {
  return rawData ? schema.load(rawData.datasets, rawData.config) : null;
}

/**
 * Parse a single dataset or an array of datasets saved using the kepler.gl schema
 * @param rawData
 * @param schema
 */
export function processKeplerglDataset(
  rawData: object | object[],
  schema = KeplerGlSchema
): ParsedDataset | ParsedDataset[] | null {
  if (!rawData) {
    return null;
  }

  const results = schema.parseSavedData(toArray(rawData));
  if (!results) {
    return null;
  }
  return Array.isArray(rawData) ? results : results[0];
}
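/*
 * Example (assumed saved-dataset shape):
 *   const one = processKeplerglDataset(savedDataset); // ParsedDataset | null
 *   const many = processKeplerglDataset([savedA, savedB]); // ParsedDataset[] | null
 */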

/**
 * Parse an arrow table and return a dataset
 *
 * @param arrowTable ArrowTable to parse, see loaders.gl/schema
 * @returns dataset containing `fields` and `rows` or null
 */
export function processArrowTable(arrowTable: ArrowTable): ProcessorResult | null {
  // @ts-ignore - Unknown data type causing build failures
  return processArrowBatches(arrowTable.data.batches);
}
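/*
 * Example (assumed usage): wrap an apache-arrow Table in the loaders.gl
 * ArrowTable shape before processing:
 *   const table = arrow.tableFromArrays({lat: [31.27, 31.22], lng: [127.56, 126.26]});
 *   const result = processArrowTable({shape: 'arrow-table', data: table} as ArrowTable);
 */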

/**
 * Extracts GeoArrow metadata from an Apache Arrow table schema.
 * For GeoParquet files GeoArrow metadata isn't present on the fields, so extract the extra info from the schema.
 * @param table The Apache Arrow table to extract metadata from.
 * @returns An object mapping column names to their GeoArrow encoding type.
 * @throws Logs an error message if parsing of metadata fails.
 */
export function getGeoArrowMetadataFromSchema(table: arrow.Table): Record<string, string> {
  const geoArrowMetadata: Record<string, string> = {};
  try {
    const geoString = table.schema.metadata?.get('geo');
    if (geoString) {
      const parsedGeoString = JSON.parse(geoString);
      if (parsedGeoString.columns) {
        Object.keys(parsedGeoString.columns).forEach(columnName => {
          const columnData = parsedGeoString.columns[columnName];
          if (columnData?.encoding === 'WKB') {
            geoArrowMetadata[columnName] = GEOARROW_EXTENSIONS.WKB;
          }
          // TODO: potentially there are other encodings, but no datasets to test with
        });
      }
    }
  } catch (error) {
    console.error('An error occurred while parsing arrow table schema metadata');
  }
  return geoArrowMetadata;
}
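/*
 * The 'geo' schema-metadata entry parsed above follows the GeoParquet
 * convention, e.g. (abridged):
 *   {"primary_column": "geometry",
 *    "columns": {"geometry": {"encoding": "WKB", "geometry_types": []}}}
 */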

/**
 * Converts an Apache Arrow table schema into an array of Kepler.gl field objects.
 * @param table The Apache Arrow table whose schema needs to be converted.
 * @param fieldTypeSuggestions Optional mapping of field names to suggested field types.
 * @returns An array of field objects suitable for Kepler.gl.
 */
export function arrowSchemaToFields(
  table: arrow.Table,
  fieldTypeSuggestions: Record<string, string> = {}
): Field[] {
  const headerRow = table.schema.fields.map(f => f.name);
  const sample = getSampleForTypeAnalyzeArrow(table, headerRow);
  const keplerFields = getFieldsFromData(sample, headerRow);
  const geoArrowMetadata = getGeoArrowMetadataFromSchema(table);

  return table.schema.fields.map((field: arrow.Field, fieldIndex: number) => {
    let type = arrowDataTypeToFieldType(field.type);
    let analyzerType = arrowDataTypeToAnalyzerDataType(field.type);
    let format = '';

    // geometry fields produced by DuckDB's st_asgeojson()
    if (fieldTypeSuggestions[field.name] === 'JSON') {
      type = ALL_FIELD_TYPES.geojson;
      analyzerType = AnalyzerDATA_TYPES.GEOMETRY_FROM_STRING;
    } else if (
      fieldTypeSuggestions[field.name] === 'GEOMETRY' ||
      field.metadata.get(GEOARROW_METADATA_KEY)?.startsWith('geoarrow')
    ) {
      type = ALL_FIELD_TYPES.geoarrow;
      analyzerType = AnalyzerDATA_TYPES.GEOMETRY;
    } else if (geoArrowMetadata[field.name]) {
      type = ALL_FIELD_TYPES.geoarrow;
      analyzerType = AnalyzerDATA_TYPES.GEOMETRY;
      field.metadata?.set(GEOARROW_METADATA_KEY, geoArrowMetadata[field.name]);
    } else if (fieldTypeSuggestions[field.name] === 'BLOB') {
      // when an arrow wkb column is saved to DuckDB as a BLOB without any metadata, then queried back
      try {
        const data = table.getChildAt(fieldIndex)?.get(0);
        if (data) {
          const binaryGeo = parseSync(data, WKBLoader);
          if (binaryGeo) {
            type = ALL_FIELD_TYPES.geoarrow;
            analyzerType = AnalyzerDATA_TYPES.GEOMETRY;
            field.metadata?.set(GEOARROW_METADATA_KEY, GEOARROW_EXTENSIONS.WKB);
          }
        }
      } catch (error) {
        // ignore, not WKB
      }
    } else {
      // TODO: should we use Kepler's getFieldsFromData instead
      // of arrowDataTypeToFieldType for all fields?
      const keplerField = keplerFields[fieldIndex];
      if (keplerField.type === ALL_FIELD_TYPES.timestamp) {
        type = keplerField.type;
        analyzerType = keplerField.analyzerType;
        format = keplerField.format;
      }
    }

    return {
      ...field,
      name: field.name,
      id: field.name,
      displayName: field.name,
      format: format,
      fieldIdx: fieldIndex,
      type,
      analyzerType,
      valueAccessor: (dc: any) => d => {
        return dc.valueAt(d.index, fieldIndex);
      },
      metadata: field.metadata
    };
  });
}
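/*
 * Example (hypothetical suggestions): type suggestions typically come from a
 * DuckDB table schema, keyed by column name:
 *   const fields = arrowSchemaToFields(table, {geom: 'GEOMETRY', props: 'JSON'});
 */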

/**
 * Parse arrow batches returned from parseInBatches()
 *
 * @param arrowBatches the arrow record batches to parse
 * @returns dataset containing `fields` and `rows` or null
 */
export function processArrowBatches(arrowBatches: arrow.RecordBatch[]): ProcessorResult | null {
  if (arrowBatches.length === 0) {
    return null;
  }
  const arrowTable = new arrow.Table(arrowBatches);
  const fields = arrowSchemaToFields(arrowTable);

  const cols = [...Array(arrowTable.numCols).keys()].map(i => arrowTable.getChildAt(i));

  // return empty rows and use the raw arrow table to construct a column-wise data container
  return {
    fields,
    rows: [],
    cols,
    metadata: arrowTable.schema.metadata,
    // Save the original arrow schema for better ingestion into DuckDB.
    // TODO: consider returning arrowTable in cols, not an array of Vectors from arrowTable.
    arrowSchema: arrowTable.schema
  };
}
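/*
 * Example (hypothetical data): build a table and process its record batches:
 *   const table = arrow.tableFromArrays({id: [1, 2], value: [3.5, 4.5]});
 *   const result = processArrowBatches(table.batches);
 *   // result.fields describes the columns; result.cols holds arrow Vectors
 */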

export const DATASET_HANDLERS = {
  [DATASET_FORMATS.row]: processRowObject,
  [DATASET_FORMATS.geojson]: processGeojson,
  [DATASET_FORMATS.csv]: processCsvData,
  [DATASET_FORMATS.arrow]: processArrowTable,
  [DATASET_FORMATS.keplergl]: processKeplerglDataset
};

export const Processors: {
  processGeojson: typeof processGeojson;
  processCsvData: typeof processCsvData;
  processArrowTable: typeof processArrowTable;
  processArrowBatches: typeof processArrowBatches;
  processRowObject: typeof processRowObject;
  processKeplerglJSON: typeof processKeplerglJSON;
  processKeplerglDataset: typeof processKeplerglDataset;
  analyzerTypeToFieldType: typeof analyzerTypeToFieldType;
  getFieldsFromData: typeof getFieldsFromData;
  parseCsvRowsByFieldType: typeof parseCsvRowsByFieldType;
} = {
  processGeojson,
  processCsvData,
  processArrowTable,
  processArrowBatches,
  processRowObject,
  processKeplerglJSON,
  processKeplerglDataset,
  analyzerTypeToFieldType,
  getFieldsFromData,
  parseCsvRowsByFieldType
};