• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

visgl / loaders.gl / 24108422669

07 Apr 2026 10:56PM UTC coverage: 35.134% (-0.3%) from 35.411%
24108422669

push

github

web-flow
feat(csv) CSVArrowLoader (#3345)

1225 of 2058 branches covered (59.52%)

Branch coverage included in aggregate %.

568 of 2529 new or added lines in 12 files covered. (22.46%)

2 existing lines in 2 files now uncovered.

39940 of 115107 relevant lines covered (34.7%)

0.77 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

34.67
/modules/csv/src/csv-loader.ts
1
// loaders.gl
1✔
2
// SPDX-License-Identifier: MIT
1✔
3
// Copyright (c) vis.gl contributors
1✔
4

1✔
5
import type {LoaderWithParser, LoaderOptions} from '@loaders.gl/loader-utils';
1✔
6
import type {Schema, ArrayRowTable, ObjectRowTable, TableBatch} from '@loaders.gl/schema';
1✔
7

1✔
8
import {log, toArrayBufferIterator} from '@loaders.gl/loader-utils';
1✔
9
import {
1✔
10
  AsyncQueue,
1✔
11
  deduceTableSchema,
1✔
12
  TableBatchBuilder,
1✔
13
  convertToArrayRow,
1✔
14
  convertToObjectRow
1✔
15
} from '@loaders.gl/schema-utils';
1✔
16
import Papa from './papaparse/papaparse';
1✔
17
import AsyncIteratorStreamer from './papaparse/async-iterator-streamer';
1✔
18
import {CSVFormat} from './csv-format';
1✔
19

1✔
20
// __VERSION__ is injected by babel-plugin-version-inline.
// When it has not been injected, the loader reports its version as 'latest'.
// @ts-ignore TS2304: Cannot find name '__VERSION__'.
const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest';

/** Table shape used whenever no `shape` option is supplied. */
const DEFAULT_CSV_SHAPE = 'object-row-table';
1✔
25

1✔
26
/**
 * Options accepted by `CSVLoader`, nested under the `csv` key.
 * Defaults for every field are listed in `CSVLoader.options.csv`.
 */
export type CSVLoaderOptions = LoaderOptions & {
  csv?: {
    // loaders.gl options
    /** Shape of the returned table / batches. */
    shape?: 'array-row-table' | 'object-row-table';
    /** optimizes memory usage but increases parsing time. */
    optimizeMemoryUsage?: boolean;
    /** Prefix of the generated column names (`<prefix>1`, `<prefix>2`, ...) when there is no header row. */
    columnPrefix?: string;
    /**
     * 'auto' treats the first row as a header when all of its values are strings.
     * A boolean forces the decision (the parsers evaluate `Boolean(header)` for non-'auto' values).
     */
    header?: 'auto' | boolean;

    // CSV options (papaparse)
    // delimiter: auto
    // newline: auto
    quoteChar?: string;
    escapeChar?: string;
    // Convert numbers and boolean values in rows from strings
    dynamicTyping?: boolean;
    /** `false` disables comment handling; per the papaparse docs a string sets the comment prefix. */
    comments?: boolean | string;
    /** `true` skips empty lines; 'greedy' also skips lines that contain only whitespace / empty cells. */
    skipEmptyLines?: boolean | 'greedy';
    // transform: null?
    delimitersToGuess?: string[];
    // fastMode: auto
  };
};
1✔
49

1✔
50
/**
 * CSV loader: the shared `CSVFormat` description plus parser entry points and default options.
 * `parse` / `parseText` return a whole table; `parseInBatches` returns table batches.
 */
export const CSVLoader = {
  ...CSVFormat,

  // Type-only placeholders: the runtime value is null, the casts carry the table / batch types
  dataType: null as unknown as ObjectRowTable | ArrayRowTable,
  batchType: null as unknown as TableBatch,
  version: VERSION,
  // Decodes the buffer with a default-constructed TextDecoder, then parses the resulting text
  parse: async (arrayBuffer: ArrayBuffer, options?: CSVLoaderOptions) =>
    parseCSV(new TextDecoder().decode(arrayBuffer), options),
  parseText: (text: string, options?: CSVLoaderOptions) => parseCSV(text, options),
  parseInBatches: parseCSVInBatches,
  // @ts-ignore
  // testText: null,
  // Defaults; both parsers spread the caller's `options.csv` over these values
  options: {
    csv: {
      shape: DEFAULT_CSV_SHAPE, // 'object-row-table'
      optimizeMemoryUsage: false,
      // CSV options
      header: 'auto',
      columnPrefix: 'column',
      // delimiter: auto
      // newline: auto
      quoteChar: '"',
      escapeChar: '"',
      dynamicTyping: true,
      comments: false,
      skipEmptyLines: true,
      // transform: null?
      delimitersToGuess: [',', '\t', '|', ';']
      // fastMode: auto
    }
  }
} as const satisfies LoaderWithParser<ObjectRowTable | ArrayRowTable, TableBatch, CSVLoaderOptions>;
1✔
82

1✔
83
/**
 * Parses a complete CSV text into a row table.
 *
 * The first row is pre-read to decide whether the data starts with a header row
 * (when `csv.header` is 'auto') and to know how many column names to generate when it does not.
 * @param csvText the csv text to parse
 * @param options loader options; `options.csv` is merged over `CSVLoader.options.csv`
 * @returns an 'object-row-table' or 'array-row-table' (per `csv.shape`) with a deduced schema
 * @throws Error if `csv.shape` is not one of the two supported shapes
 */
async function parseCSV(
  csvText: string,
  options?: CSVLoaderOptions
): Promise<ObjectRowTable | ArrayRowTable> {
  // Apps can call the parse method directly, so we apply default options here
  const csvOptions = {...CSVLoader.options.csv, ...options?.csv};

  // NOTE: `firstRow` is undefined when the text contains no rows at all (e.g. empty input)
  const firstRow = readFirstRow(csvText);
  // Boolean() keeps `header` a real boolean even when there is no first row to inspect
  const header: boolean =
    csvOptions.header === 'auto' ? Boolean(isHeaderRow(firstRow)) : Boolean(csvOptions.header);

  const papaparseConfig = {
    // dynamicTyping: true,
    ...csvOptions,
    header,
    download: false, // We handle loading, no need for papaparse to do it for us
    // A fresh transformer per parse, so duplicate tracking never leaks between calls
    transformHeader: header ? duplicateColumnTransformer() : undefined,
    error: (e: unknown) => {
      // Rethrow genuine errors untouched (keeps message and stack), wrap anything else
      throw e instanceof Error ? e : new Error(String(e));
    }
  };

  const result = Papa.parse(csvText, papaparseConfig);
  const rows = result.data as any[];

  // Without parsed field names, generate `<prefix>1..N`; an absent first row means zero columns
  const headerRow =
    result.meta.fields || generateHeader(csvOptions.columnPrefix, firstRow?.length ?? 0);

  const shape = csvOptions.shape || DEFAULT_CSV_SHAPE;
  let table: ArrayRowTable | ObjectRowTable;
  switch (shape) {
    case 'object-row-table':
      table = {
        shape: 'object-row-table',
        // Rows are arrays when parsed without a header; convert those to objects
        data: rows.map((row) => (Array.isArray(row) ? convertToObjectRow(row, headerRow) : row))
      };
      break;
    case 'array-row-table':
      table = {
        shape: 'array-row-table',
        // Rows are objects when parsed with a header; convert those to arrays
        data: rows.map((row) => (Array.isArray(row) ? row : convertToArrayRow(row, headerRow)))
      };
      break;
    default:
      throw new Error(shape);
  }
  table.schema = deduceTableSchema(table!);
  return table;
}
×
133

1✔
134
// TODO - support batch size 0 = no batching/single batch?
/**
 * Parses CSV data incrementally and emits table batches.
 *
 * Rows are fed one at a time (papaparse `step` callback) into a `TableBatchBuilder`;
 * completed batches, and any error raised while building them, are enqueued on the
 * returned queue, which is closed from the `complete` callback.
 * @param asyncIterator (async) iterable of binary chunks containing the CSV data
 * @param options loader options; `options.csv` is merged over `CSVLoader.options.csv`
 * @returns the async queue of table batches
 */
function parseCSVInBatches(
  asyncIterator:
    | AsyncIterable<ArrayBufferLike | ArrayBufferView>
    | Iterable<ArrayBufferLike | ArrayBufferView>,
  options?: CSVLoaderOptions
): AsyncIterable<TableBatch> {
  // Papaparse does not support standard batch size handling
  // TODO - investigate papaparse chunks mode
  // Copy `options` AND `options.core` before overriding batchSize, so that the
  // objects passed in by the caller are never mutated.
  options = {...options};
  if (options.core?.batchSize === 'auto') {
    options.core = {...options.core, batchSize: 4000};
  }

  // Apps can call the parse method directly, so we apply default options here
  const csvOptions = {...CSVLoader.options.csv, ...options?.csv};

  const asyncQueue = new AsyncQueue<TableBatch>();

  let isFirstRow: boolean = true;
  let headerRow: string[] | null = null;
  let tableBatchBuilder: TableBatchBuilder | null = null;
  let schema: Schema | null = null;

  const config = {
    // dynamicTyping: true, // Convert numbers and boolean values in rows from strings,
    ...csvOptions,
    header: false, // Unfortunately, header detection is not automatic and does not infer shapes
    download: false, // We handle loading, no need for papaparse to do it for us
    // chunkSize is set to 5MB explicitly (same as Papaparse default) due to a bug where the
    // streaming parser gets stuck if skipEmptyLines and a step callback are both supplied.
    // See https://github.com/mholt/PapaParse/issues/465
    chunkSize: 1024 * 1024 * 5,
    // skipEmptyLines is always disabled in papaparse and handled manually in `step` instead,
    // given two bugs where the streaming parser gets stuck if both of the skipEmptyLines and
    // step callback options are provided:
    // - true doesn't work unless chunkSize is set: https://github.com/mholt/PapaParse/issues/465
    // - greedy doesn't work: https://github.com/mholt/PapaParse/issues/825
    skipEmptyLines: false,

    // step is called on every row
    // eslint-disable-next-line complexity, max-statements
    step(results) {
      let row = results.data;

      if (csvOptions.skipEmptyLines === 'greedy') {
        // Manually reject lines that are empty
        const collapsedRow = row.flat().join('').trim();
        if (collapsedRow === '') {
          return;
        }
      } else if (csvOptions.skipEmptyLines === true) {
        row = normalizePapaStreamingRow(row);
        // An empty line arrives as a single empty cell: null after dynamic typing,
        // presumably '' when dynamicTyping is disabled. Skip both.
        if (row.length === 1 && (row[0] === null || row[0] === '')) {
          return;
        }
      }
      const bytesUsed = results.meta.cursor;

      // Check if we need to save a header row
      if (isFirstRow && !headerRow) {
        // Auto detects or can be forced with csvOptions.header
        const header = csvOptions.header === 'auto' ? isHeaderRow(row) : Boolean(csvOptions.header);
        if (header) {
          headerRow = row.map(duplicateColumnTransformer());
          // The header row itself is not data; `isFirstRow` stays true for the next row
          return;
        }
      }

      // If first data row, we can deduce the schema
      if (isFirstRow) {
        isFirstRow = false;
        if (!headerRow) {
          headerRow = generateHeader(csvOptions.columnPrefix, row.length);
        }
        schema = deduceCSVSchema(row, headerRow);
      }

      if (csvOptions.optimizeMemoryUsage) {
        // A workaround to allocate new strings and don't retain pointers to original strings.
        // https://bugs.chromium.org/p/v8/issues/detail?id=2869
        row = JSON.parse(JSON.stringify(row));
      }

      // A top-level `options.shape` (not part of the typed options) takes precedence if present
      const shape = (options as any)?.shape || csvOptions.shape || DEFAULT_CSV_SHAPE;
      if (shape === 'object-row-table' && headerRow && row.length > headerRow.length) {
        // Keep cells beyond the known columns under `__parsed_extra`
        row = convertToPapaObjectRow(row, headerRow);
      }

      // Add the row
      tableBatchBuilder =
        tableBatchBuilder ||
        new TableBatchBuilder(
          // @ts-expect-error TODO this is not a proper schema
          schema,
          {
            shape,
            ...(options?.core || {})
          }
        );

      try {
        tableBatchBuilder.addRow(row);
        // If a batch has been completed, emit it
        const batch = tableBatchBuilder && tableBatchBuilder.getFullBatch({bytesUsed});
        if (batch) {
          asyncQueue.enqueue(batch);
        }
      } catch (error) {
        asyncQueue.enqueue(error as Error);
      }
    },

    // complete is called when all rows have been read
    complete(results) {
      try {
        const bytesUsed = results.meta.cursor;
        // Ensure any final (partial) batch gets emitted
        const batch = tableBatchBuilder && tableBatchBuilder.getFinalBatch({bytesUsed});
        if (batch) {
          asyncQueue.enqueue(batch);
        }
      } catch (error) {
        asyncQueue.enqueue(error as Error);
      }

      asyncQueue.close();
    }
  };

  Papa.parse(toArrayBufferIterator(asyncIterator), config, AsyncIteratorStreamer);

  // TODO - Does it matter if we return asyncIterable or asyncIterator
  // return asyncQueue[Symbol.asyncIterator]();
  return asyncQueue;
}
×
270

1✔
271
/**
 * Checks if a certain row is a header row
 * @param row the row to check
 * @returns true if every value in the row is a string; false when there is no row to check
 */
function isHeaderRow(row: string[]): boolean {
  // Callers may pass a missing row (e.g. the first row of an empty file);
  // Array.isArray guarantees a real boolean instead of leaking undefined / null.
  return Array.isArray(row) && row.every((value) => typeof value === 'string');
}
×
279

1✔
280
/**
 * Parses only the first row of a CSV text (papaparse `preview: 1`, dynamic typing enabled)
 * @param csvText the csv text to parse
 * @returns element 0 of the parsed `data` array
 */
function readFirstRow(csvText: string): any[] {
  const {data} = Papa.parse(csvText, {dynamicTyping: true, preview: 1});
  return data[0];
}
×
292

1✔
293
/**
 * Builds a stateful column renamer. Papaparse does not handle duplicate header columns
 * (the latest occurrence would win), so repeated names get a numeric suffix instead.
 * See the header option in https://www.papaparse.com/docs#config
 * @returns a function mapping a column name to the first of `name`, `name.1`, `name.2`, ...
 *   that this transformer instance has not returned before
 */
function duplicateColumnTransformer(): (column: string) => string {
  const seen = new Set<string>();
  return (column) => {
    let candidate = column;
    // Try increasing suffixes until the candidate is unused
    for (let suffix = 1; seen.has(candidate); suffix++) {
      candidate = `${column}.${suffix}`;
    }
    seen.add(candidate);
    return candidate;
  };
}
×
312

1✔
313
/**
 * Builds 1-based column names of the form `<columnPrefix><n>`
 * @param columnPrefix text placed in front of every column number
 * @param count how many names to produce (defaults to 0)
 * @returns the generated names, in ascending order
 */
function generateHeader(columnPrefix: string, count: number = 0): string[] {
  const names: string[] = [];
  let produced = 0;
  while (produced < count) {
    produced += 1;
    names.push(`${columnPrefix}${produced}`);
  }
  return names;
}
×
326

1✔
NEW
327
/**
 * Returns a copy of a streamed row in which every empty-array cell is replaced by `null`
 * @param row the row as delivered to the step callback
 * @returns a new array; all other cells are passed through untouched
 */
function normalizePapaStreamingRow(row: unknown[]): unknown[] {
  return row.map((cell) => {
    const isEmptyArray = Array.isArray(cell) && cell.length === 0;
    if (isEmptyArray) {
      return null;
    }
    return cell;
  });
}
×
330

1✔
NEW
331
/**
 * Converts an array row to an object row, keeping any cells beyond the known columns
 * @param row the array row to convert
 * @param headerRow the column names
 * @returns the object row; cells past `headerRow.length` are stored under `__parsed_extra`
 */
function convertToPapaObjectRow(
  row: unknown[],
  headerRow: string[]
): {[columnName: string]: unknown} {
  const converted = convertToObjectRow(row, headerRow);
  const overflow = row.slice(headerRow.length);
  if (overflow.length === 0) {
    return converted;
  }
  converted.__parsed_extra = overflow;
  return converted;
}
×
342

1✔
343
/**
 * Deduces a schema from a single (first) data row
 * @param row the row whose value types determine the column types
 * @param headerRow the column names; a column without a (non-empty) name is named by its index
 * @returns a schema with one nullable field per cell: 'float64' for numbers, 'bool' for
 *   booleans and 'utf8' for strings and (after logging a warning) for any other type
 */
function deduceCSVSchema(row: unknown[], headerRow: string[] | null): Schema {
  const fields: Schema['fields'] = [];
  for (let i = 0; i < row.length; i++) {
    const name = String((headerRow && headerRow[i]) || i);
    const value = row[i];
    switch (typeof value) {
      case 'number':
        fields.push({name, type: 'float64', nullable: true});
        break;
      case 'boolean':
        fields.push({name, type: 'bool', nullable: true});
        break;
      case 'string':
        fields.push({name, type: 'utf8', nullable: true});
        break;
      default:
        // e.g. null cells report typeof 'object'; such columns are treated as text
        log.warn(`CSV: Unknown column type: ${typeof value}`)();
        fields.push({name, type: 'utf8', nullable: true});
    }
  }
  return {
    fields,
    metadata: {
      'loaders.gl#format': 'csv',
      'loaders.gl#loader': 'CSVLoader'
    }
  };
}
×
371

1✔
372
// TODO - remove
1✔
373
// type ObjectField = {name: string; index: number; type: any};
1✔
374
// type ObjectSchema = {[key: string]: ObjectField} | ObjectField[];
1✔
375

1✔
376
// function deduceObjectSchema(row, headerRow): ObjectSchema {
1✔
377
//   const schema: ObjectSchema = headerRow ? {} : [];
1✔
378
//   for (let i = 0; i < row.length; i++) {
1✔
379
//     const columnName = (headerRow && headerRow[i]) || i;
1✔
380
//     const value = row[i];
1✔
381
//     switch (typeof value) {
1✔
382
//       case 'number':
1✔
383
//       case 'boolean':
1✔
384
//         // TODO - booleans could be handled differently...
1✔
385
//         schema[columnName] = {name: String(columnName), index: i, type: Float32Array};
1✔
386
//         break;
1✔
387
//       case 'string':
1✔
388
//       default:
1✔
389
//         schema[columnName] = {name: String(columnName), index: i, type: Array};
1✔
390
//       // We currently only handle numeric rows
1✔
391
//       // TODO we could offer a function to map strings to numbers?
1✔
392
//     }
1✔
393
//   }
1✔
394
//   return schema;
1✔
395
// }
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc