From 451975dc186e6bceebc4488653f0e38428e465e0 Mon Sep 17 00:00:00 2001 From: Ib Green Date: Wed, 16 Oct 2024 14:25:05 -0400 Subject: [PATCH 1/7] feat(csv): CSVArrowLoader --- modules/csv/src/csv-arrow-loader.ts | 41 ++++++++ modules/csv/src/index.ts | 3 + ...arrow.spec.ts => csv-arrow-loader.spec.ts} | 16 +-- modules/csv/test/csv-loader.spec.ts | 4 + modules/csv/test/csv-writer-papaparse.spec.ts | 4 + modules/csv/test/index.ts | 4 +- modules/schema-utils/src/index.ts | 42 +++++--- ...able-batch-aggregator-builders.ts.disabled | 0 .../arrow-table-batch-aggregator.ts | 0 .../base-table-batch-aggregator.ts | 0 .../columnar-table-batch-aggregator.ts | 0 .../row-table-batch-aggregator.ts | 0 .../table-batch-aggregator.ts | 0 .../table-batch-builder.ts | 0 .../src/lib/table/batches/convert-batches.ts | 83 ++++++++++++++++ .../batches/make-arrow-batch-iterator.ts | 75 ++++++++++++++ .../batches/make-table-batch-iterator.ts | 25 +++++ .../make-table-from-batches.ts | 21 +--- .../lib/table/tables/convert-arrow-table.ts | 98 ++++++++++--------- .../src/lib/table/tables/convert-table.ts | 40 ++++---- .../src/lib/table/tables/make-table.ts | 18 +++- .../src/lib/table/tables/table-types.ts | 37 +++++++ 22 files changed, 406 insertions(+), 105 deletions(-) create mode 100644 modules/csv/src/csv-arrow-loader.ts rename modules/csv/test/{csv-loader-arrow.spec.ts => csv-arrow-loader.spec.ts} (78%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/arrow-table-batch-aggregator-builders.ts.disabled (100%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/arrow-table-batch-aggregator.ts (100%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/base-table-batch-aggregator.ts (100%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/columnar-table-batch-aggregator.ts (100%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/row-table-batch-aggregator.ts (100%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/table-batch-aggregator.ts (100%) rename modules/schema-utils/src/lib/table/{batches => batch-builder}/table-batch-builder.ts (100%) create mode 100644 modules/schema-utils/src/lib/table/batches/convert-batches.ts create mode 100644 modules/schema-utils/src/lib/table/batches/make-arrow-batch-iterator.ts create mode 100644 modules/schema-utils/src/lib/table/batches/make-table-batch-iterator.ts rename modules/schema-utils/src/lib/table/{tables => batches}/make-table-from-batches.ts (74%) create mode 100644 modules/schema-utils/src/lib/table/tables/table-types.ts diff --git a/modules/csv/src/csv-arrow-loader.ts b/modules/csv/src/csv-arrow-loader.ts new file mode 100644 index 0000000000..78d3ce62f5 --- /dev/null +++ b/modules/csv/src/csv-arrow-loader.ts @@ -0,0 +1,41 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import type {LoaderWithParser, LoaderOptions} from '@loaders.gl/loader-utils'; +import type {ArrowTable, ArrowTableBatch} from '@loaders.gl/schema'; +import {convertTable, convertBatches} from '@loaders.gl/schema-utils'; + +import type {CSVLoaderOptions} from './csv-loader'; +import {CSVLoader} from './csv-loader'; + +export type CSVArrowLoaderOptions = LoaderOptions & { + csv?: Omit; +}; + +export const CSVArrowLoader = { + ...CSVLoader, + + dataType: null as unknown as ArrowTable, + batchType: null as unknown as ArrowTableBatch, + + parse: async (arrayBuffer: ArrayBuffer, options?: CSVLoaderOptions) => + parseCSVToArrow(new TextDecoder().decode(arrayBuffer), options), + parseText: (text: string, options?: CSVLoaderOptions) => parseCSVToArrow(text, options), + parseInBatches: parseCSVToArrowBatches +} as const satisfies LoaderWithParser; + +async function parseCSVToArrow(csvText: string, options?: CSVLoaderOptions): Promise { + // Apps can call the parse method directly, we so apply default options here + // const csvOptions = {...CSVArrowLoader.options.csv, ...options?.csv}; + const table = await CSVLoader.parseText(csvText, options); + return convertTable(table, 'arrow-table'); +} + +function parseCSVToArrowBatches( + asyncIterator: AsyncIterable | Iterable, + options?: CSVArrowLoaderOptions +): AsyncIterable { + const tableIterator = CSVLoader.parseInBatches(asyncIterator, options); + return convertBatches(tableIterator, 'arrow-table'); +} diff --git a/modules/csv/src/index.ts b/modules/csv/src/index.ts index d25134bc75..3e00dd719f 100644 --- a/modules/csv/src/index.ts +++ b/modules/csv/src/index.ts @@ -7,3 +7,6 @@ export {CSVLoader} from './csv-loader'; export type {CSVWriterOptions} from './csv-writer'; export {CSVWriter} from './csv-writer'; + +export type {CSVArrowLoaderOptions} from './csv-arrow-loader'; +export {CSVArrowLoader} from './csv-arrow-loader'; diff --git a/modules/csv/test/csv-loader-arrow.spec.ts b/modules/csv/test/csv-arrow-loader.spec.ts similarity index 78% rename from modules/csv/test/csv-loader-arrow.spec.ts rename to modules/csv/test/csv-arrow-loader.spec.ts index 2a63337629..c0035c9886 100644 --- a/modules/csv/test/csv-loader-arrow.spec.ts +++ b/modules/csv/test/csv-arrow-loader.spec.ts @@ -1,15 +1,18 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + import test from 'tape-promise/tape'; import {loadInBatches, isIterator, isAsyncIterable} from '@loaders.gl/core'; -import {CSVLoader} from '../src/csv-loader'; // from '@loaders.gl/csv'; +import {CSVArrowLoader} from '@loaders.gl/csv'; import * as arrow from 'apache-arrow'; // Small CSV Sample Files const CSV_NUMBERS_100_URL = '@loaders.gl/csv/test/data/numbers-100.csv'; const CSV_NUMBERS_10000_URL = '@loaders.gl/csv/test/data/numbers-10000.csv'; -// TODO -restore -test.skip('CSVLoader#loadInBatches(numbers-100.csv, arrow)', async (t) => { - const iterator = await loadInBatches(CSV_NUMBERS_100_URL, CSVLoader, { +test('CSVArrowLoader#loadInBatches(numbers-100.csv, arrow)', async (t) => { + const iterator = await loadInBatches(CSV_NUMBERS_100_URL, CSVArrowLoader, { csv: { shape: 'arrow-table' }, @@ -29,9 +32,8 @@ test.skip('CSVLoader#loadInBatches(numbers-100.csv, arrow)', async (t) => { t.end(); }); -// TODO - restore -test.skip('CSVLoader#loadInBatches(numbers-10000.csv, arrow)', async (t) => { - const iterator = await loadInBatches(CSV_NUMBERS_10000_URL, CSVLoader, { +test('CSVArrowLoader#loadInBatches(numbers-10000.csv, arrow)', async (t) => { + const iterator = await loadInBatches(CSV_NUMBERS_10000_URL, CSVArrowLoader, { csv: { shape: 'arrow-table' }, diff --git a/modules/csv/test/csv-loader.spec.ts b/modules/csv/test/csv-loader.spec.ts index aafe93f5ed..66b0c3c5bd 100644 --- a/modules/csv/test/csv-loader.spec.ts +++ b/modules/csv/test/csv-loader.spec.ts @@ -1,3 +1,7 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + import test from 'tape-promise/tape'; import {validateLoader} from 'test/common/conformance'; diff --git a/modules/csv/test/csv-writer-papaparse.spec.ts b/modules/csv/test/csv-writer-papaparse.spec.ts index f3690960c9..84d90a5b6c 100644 --- a/modules/csv/test/csv-writer-papaparse.spec.ts +++ b/modules/csv/test/csv-writer-papaparse.spec.ts @@ -1,3 +1,7 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + // This is a fork of papaparse under MIT License // https://github.com/mholt/PapaParse /* eslint-disable */ diff --git a/modules/csv/test/index.ts b/modules/csv/test/index.ts index d2fc0b8f48..6ffc16977e 100644 --- a/modules/csv/test/index.ts +++ b/modules/csv/test/index.ts @@ -3,6 +3,6 @@ import './papaparse/papaparse.spec'; // import './csv-writer-papaparse.spec'; import './csv-loader.spec'; -import './csv-loader-arrow.spec'; - import './csv-writer.spec'; + +import './csv-arrow-loader.spec'; diff --git a/modules/schema-utils/src/index.ts b/modules/schema-utils/src/index.ts index c4ade48bfe..7dd92ed0d1 100644 --- a/modules/schema-utils/src/index.ts +++ b/modules/schema-utils/src/index.ts @@ -21,14 +21,33 @@ export { } from './lib/schema/convert-arrow-schema'; export {getDataTypeFromArray} from './lib/schema/data-type'; -// Table utils +// TABLE CATEGORY UTILS + +export {deduceTableSchema} from './lib/schema/deduce-table-schema'; +export {makeTableFromData} from './lib/table/tables/make-table'; +export {makeTableFromBatches} from './lib/table/batches/make-table-from-batches'; + +export {convertTable} from './lib/table/tables/convert-table'; +export {convertToObjectRow, convertToArrayRow} from './lib/table/tables/row-utils'; export {convertArrowToTable, convertTableToArrow} from './lib/table/tables/convert-arrow-table'; -// TABLE CATEGORY UTILS -export {TableBatchBuilder} from './lib/table/batches/table-batch-builder'; -export type {TableBatchAggregator} from './lib/table/batches/table-batch-aggregator'; -export {RowTableBatchAggregator} from './lib/table/batches/row-table-batch-aggregator'; -export {ColumnarTableBatchAggregator} from './lib/table/batches/columnar-table-batch-aggregator'; +export { + makeTableBatchIterator, + makeBatchFromTable +} from './lib/table/batches/make-table-batch-iterator'; +export { + makeArrowTableBatchIterator, + makeArrowRecordBatchIterator +} from './lib/table/batches/make-arrow-batch-iterator'; +export {convertBatch, convertBatches} from './lib/table/batches/convert-batches'; + +export { + isArrayRowTable, + isObjectRowTable, + isColumnarTable, + isGeoJSONTable, + isArrowTable +} from './lib/table/tables/table-types'; export { isTable, @@ -46,11 +65,12 @@ export { makeObjectRowIterator } from './lib/table/tables/table-accessors'; -export {makeTableFromData} from './lib/table/tables/make-table'; -export {makeTableFromBatches, makeBatchFromTable} from './lib/table/tables/make-table-from-batches'; -export {convertTable} from './lib/table/tables/convert-table'; -export {deduceTableSchema} from './lib/schema/deduce-table-schema'; -export {convertToObjectRow, convertToArrayRow} from './lib/table/tables/row-utils'; +// Table batch builders + +export {TableBatchBuilder} from './lib/table/batch-builder/table-batch-builder'; +export type {TableBatchAggregator} from './lib/table/batch-builder/table-batch-aggregator'; +export {RowTableBatchAggregator} from './lib/table/batch-builder/row-table-batch-aggregator'; +export {ColumnarTableBatchAggregator} from './lib/table/batch-builder/columnar-table-batch-aggregator'; export {ArrowLikeTable} from './lib/table/arrow-api/arrow-like-table'; diff --git a/modules/schema-utils/src/lib/table/batches/arrow-table-batch-aggregator-builders.ts.disabled b/modules/schema-utils/src/lib/table/batch-builder/arrow-table-batch-aggregator-builders.ts.disabled similarity index 100% rename from modules/schema-utils/src/lib/table/batches/arrow-table-batch-aggregator-builders.ts.disabled rename to modules/schema-utils/src/lib/table/batch-builder/arrow-table-batch-aggregator-builders.ts.disabled diff --git a/modules/schema-utils/src/lib/table/batches/arrow-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/arrow-table-batch-aggregator.ts similarity index 100% rename from modules/schema-utils/src/lib/table/batches/arrow-table-batch-aggregator.ts rename to modules/schema-utils/src/lib/table/batch-builder/arrow-table-batch-aggregator.ts diff --git a/modules/schema-utils/src/lib/table/batches/base-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/base-table-batch-aggregator.ts similarity index 100% rename from modules/schema-utils/src/lib/table/batches/base-table-batch-aggregator.ts rename to modules/schema-utils/src/lib/table/batch-builder/base-table-batch-aggregator.ts diff --git a/modules/schema-utils/src/lib/table/batches/columnar-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts similarity index 100% rename from modules/schema-utils/src/lib/table/batches/columnar-table-batch-aggregator.ts rename to modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts diff --git a/modules/schema-utils/src/lib/table/batches/row-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/row-table-batch-aggregator.ts similarity index 100% rename from modules/schema-utils/src/lib/table/batches/row-table-batch-aggregator.ts rename to modules/schema-utils/src/lib/table/batch-builder/row-table-batch-aggregator.ts diff --git a/modules/schema-utils/src/lib/table/batches/table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/table-batch-aggregator.ts similarity index 100% rename from modules/schema-utils/src/lib/table/batches/table-batch-aggregator.ts rename to modules/schema-utils/src/lib/table/batch-builder/table-batch-aggregator.ts diff --git a/modules/schema-utils/src/lib/table/batches/table-batch-builder.ts b/modules/schema-utils/src/lib/table/batch-builder/table-batch-builder.ts similarity index 100% rename from modules/schema-utils/src/lib/table/batches/table-batch-builder.ts rename to modules/schema-utils/src/lib/table/batch-builder/table-batch-builder.ts diff --git a/modules/schema-utils/src/lib/table/batches/convert-batches.ts b/modules/schema-utils/src/lib/table/batches/convert-batches.ts new file mode 100644 index 0000000000..a030d30727 --- /dev/null +++ b/modules/schema-utils/src/lib/table/batches/convert-batches.ts @@ -0,0 +1,83 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import type { + TableBatch, + ArrayRowTableBatch, + ObjectRowTableBatch, + ColumnarTableBatch, + ArrowTableBatch +} from '@loaders.gl/schema'; +import {convertTable} from '../tables/convert-table'; + +export function convertBatch(batches: TableBatch, shape: 'object-row-table'): ObjectRowTableBatch; +export function convertBatch(batches: TableBatch, shape: 'array-row-table'): ArrayRowTableBatch; +export function convertBatch(batches: TableBatch, shape: 'columnar-table'): ColumnarTableBatch; +export function convertBatch(batches: TableBatch, shape: 'arrow-table'): ArrowTableBatch; + +/** Convert a table batch to a different shape */ +export function convertBatch( + batch: TableBatch, + shape: 'object-row-table' | 'array-row-table' | 'columnar-table' | 'arrow-table' +): TableBatch { + switch (batch.shape) { + case 'object-row-table': + return {...batch, ...convertTable(batch, 'object-row-table')}; + case 'array-row-table': + return {...batch, ...convertTable(batch, 'array-row-table')}; + case 'columnar-table': + return {...batch, ...convertTable(batch, 'columnar-table')}; + case 'arrow-table': + return {...batch, ...convertTable(batch, 'arrow-table')}; + default: + throw new Error(shape); + } +} + +export function convertBatches( + batches: Iterable | AsyncIterable, + shape: 'object-row-table' +): AsyncIterableIterator; +export function convertBatches( + batches: Iterable | AsyncIterable, + shape: 'array-row-table' +): AsyncIterableIterator; +export function convertBatches( + batches: Iterable | AsyncIterable, + shape: 'columnar-table' +): AsyncIterableIterator; +export function convertBatches( + batches: Iterable | AsyncIterable, + shape: 'arrow-table' +): AsyncIterableIterator; + +/** + * Convert batches to a different shape + * @param table + * @param shape + * @returns + */ +export async function* convertBatches( + batches: Iterable | AsyncIterable, + shape: 'object-row-table' | 'array-row-table' | 'columnar-table' | 'arrow-table' +): AsyncIterableIterator { + for await (const batch of batches) { + switch (shape) { + case 'object-row-table': + yield convertBatch(batch, 'object-row-table'); + break; + case 'array-row-table': + yield convertBatch(batch, 'array-row-table'); + break; + case 'columnar-table': + yield convertBatch(batch, 'columnar-table'); + break; + case 'arrow-table': + yield convertBatch(batch, 'arrow-table'); + break; + default: + throw new Error(shape); + } + } +} diff --git a/modules/schema-utils/src/lib/table/batches/make-arrow-batch-iterator.ts b/modules/schema-utils/src/lib/table/batches/make-arrow-batch-iterator.ts new file mode 100644 index 0000000000..8a7fc1756a --- /dev/null +++ b/modules/schema-utils/src/lib/table/batches/make-arrow-batch-iterator.ts @@ -0,0 +1,75 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import * as arrow from 'apache-arrow'; +import type {Table, ArrowTableBatch} from '@loaders.gl/schema'; + +import {convertSchemaToArrow} from '../../schema/convert-arrow-schema'; +import {getTableLength, getTableNumCols, getTableCellAt} from '../tables/table-accessors'; + +/** + * Returns an iterator that yields a single table as a sequence of ArrowTable batches. + * @note All batches will have the same shape and schema as the original table. + */ +export function* makeArrowTableBatchIterator( + table: Table, + options?: {batchSize?: number} +): IterableIterator { + for (const batch of makeArrowRecordBatchIterator(table, options)) { + const arrowTable = new arrow.Table([batch]); + yield { + ...batch, + shape: 'arrow-table', + schema: table.schema, + batchType: 'data', + length: arrowTable.numRows, + data: arrowTable + }; + } +} + +/** + * Returns an iterator that yields a single table as a sequence of arrow.RecordBatch batches. + * @note All batches will have the same shape and schema as the original table. + */ +export function* makeArrowRecordBatchIterator( + table: Table, + options?: {batchSize?: number} +): IterableIterator { + const arrowSchema = convertSchemaToArrow(table.schema!); + + const length = getTableLength(table); + const numColumns = getTableNumCols(table); + const batchSize = options?.batchSize || length; + + const builders = arrowSchema?.fields.map((arrowField) => arrow.makeBuilder(arrowField)); + const structField = new arrow.Struct(arrowSchema.fields); + + let batchLength = 0; + for (let rowIndex = 0; rowIndex < length; rowIndex++) { + for (let columnIndex = 0; columnIndex < numColumns; ++columnIndex) { + const value = getTableCellAt(table, rowIndex, columnIndex); + + const builder = builders[columnIndex]; + builder.append(value); + batchLength++; + + if (batchLength >= batchSize) { + const datas = builders.map((builder) => builder.flush()); + const structData = new arrow.Data(structField, 0, batchLength, 0, undefined, datas); + yield new arrow.RecordBatch(arrowSchema, structData); + batchLength = 0; + } + } + } + + if (batchLength > 0) { + const datas = builders.map((builder) => builder.flush()); + const structData = new arrow.Data(structField, 0, batchLength, 0, undefined, datas); + yield new arrow.RecordBatch(arrowSchema, structData); + batchLength = 0; + } + + builders.map((builder) => builder.finish()); +} diff --git a/modules/schema-utils/src/lib/table/batches/make-table-batch-iterator.ts b/modules/schema-utils/src/lib/table/batches/make-table-batch-iterator.ts new file mode 100644 index 0000000000..7d1a23e3f6 --- /dev/null +++ b/modules/schema-utils/src/lib/table/batches/make-table-batch-iterator.ts @@ -0,0 +1,25 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import type {TableBatch, Table} from '@loaders.gl/schema'; +import {getTableLength} from '../tables/table-accessors'; + +/** + * Returns an iterator that yields the contents of a table as a sequence of batches. + * @todo Currently only a single batch is yielded. + * @note All batches will have the same shape and schema as the original table. + * @returns + */ +export function* makeTableBatchIterator(table: Table): IterableIterator { + yield makeBatchFromTable(table); +} + +/** + * Returns a table packaged as a single table batch + * @note The batch will have the same shape and schema as the original table. + * @returns `null` if no batches are yielded by the async iterator + */ +export function makeBatchFromTable(table: Table): TableBatch { + return {...table, length: getTableLength(table), batchType: 'data'}; +} diff --git a/modules/schema-utils/src/lib/table/tables/make-table-from-batches.ts b/modules/schema-utils/src/lib/table/batches/make-table-from-batches.ts similarity index 74% rename from modules/schema-utils/src/lib/table/tables/make-table-from-batches.ts rename to modules/schema-utils/src/lib/table/batches/make-table-from-batches.ts index edf4ce4aaa..0739240430 100644 --- a/modules/schema-utils/src/lib/table/tables/make-table-from-batches.ts +++ b/modules/schema-utils/src/lib/table/batches/make-table-from-batches.ts @@ -10,26 +10,7 @@ import type { ArrayRowTable, Feature } from '@loaders.gl/schema'; -import {getTableLength} from './table-accessors'; - -/** - * Returns an iterator that yields a single table as a sequence of batches. - * @note Currently only a single batch is yielded. - * @note All batches will have the same shape and schema as the original table. - * @returns - */ -export function* makeBatchesFromTable(table: Table): IterableIterator { - yield makeBatchFromTable(table); -} - -/** - * Returns a table packaged as a single table batch - * @note The batch will have the same shape and schema as the original table. - * @returns `null` if no batches are yielded by the async iterator - */ -export function makeBatchFromTable(table: Table): TableBatch { - return {...table, length: getTableLength(table), batchType: 'data'}; -} +import {getTableLength} from '../tables/table-accessors'; /** * Assembles all batches from an async iterator into a single table. diff --git a/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts b/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts index 63a27d602d..05c6bebceb 100644 --- a/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts +++ b/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts @@ -14,9 +14,8 @@ import type { } from '@loaders.gl/schema'; import {convertTable} from './convert-table'; -import {convertArrowToSchema, convertSchemaToArrow} from '../../schema/convert-arrow-schema'; -import {getTableLength, getTableNumCols, getTableCellAt} from './table-accessors'; - +import {convertArrowToSchema} from '../../schema/convert-arrow-schema'; +import {makeArrowRecordBatchIterator} from '../batches/make-arrow-batch-iterator'; /** * * Convert a loaders.gl Table to an Apache Arrow Table * @param mesh @@ -36,52 +35,11 @@ export function convertTableToArrow(table: Table, options?: {batchSize?: number} // fall through default: - const arrowBatchIterator = makeTableToArrowBatchesIterator(table, options); + const arrowBatchIterator = makeArrowRecordBatchIterator(table, options); return new arrow.Table(arrowBatchIterator); } } -export function* makeTableToArrowBatchesIterator( - table: Table, - options?: {batchSize?: number} -): IterableIterator { - const arrowSchema = convertSchemaToArrow(table.schema!); - - const length = getTableLength(table); - const numColumns = getTableNumCols(table); - const batchSize = options?.batchSize || length; - - const builders = arrowSchema?.fields.map((arrowField) => arrow.makeBuilder(arrowField)); - const structField = new arrow.Struct(arrowSchema.fields); - - let batchLength = 0; - for (let rowIndex = 0; rowIndex < length; rowIndex++) { - for (let columnIndex = 0; columnIndex < numColumns; ++columnIndex) { - const value = getTableCellAt(table, rowIndex, columnIndex); - - const builder = builders[columnIndex]; - builder.append(value); - batchLength++; - - if (batchLength >= batchSize) { - const datas = builders.map((builder) => builder.flush()); - const structData = new arrow.Data(structField, 0, batchLength, 0, undefined, datas); - yield new arrow.RecordBatch(arrowSchema, structData); - batchLength = 0; - } - } - } - - if (batchLength > 0) { - const datas = builders.map((builder) => builder.flush()); - const structData = new arrow.Data(structField, 0, batchLength, 0, undefined, datas); - yield new arrow.RecordBatch(arrowSchema, structData); - batchLength = 0; - } - - builders.map((builder) => builder.finish()); -} - /** * Convert an Apache Arrow table to a loaders.gl Table * @note Currently does not convert schema @@ -187,3 +145,53 @@ function convertArrowToGeoJSONTable(arrowTable: arrow.Table): GeoJSONTable { features }; } + +// /** +// * Wrap an apache arrow table in a loaders.gl table wrapper. +// * From this additional conversions are available. +// * @param arrowTable +// * @returns +// */ +// function convertArrowToArrowTable(arrowTable: arrow.Table): ArrowTable { +// return { +// shape: 'arrow-table', +// schema: convertArrowToSchema(arrowTable.schema), +// data: arrowTable +// }; +// } + +// function convertArrowToArrayRowTable(arrowTable: arrow.Table): Table { +// const columnarTable = convertArrowToColumnarTable(arrowTable); +// return convertTable(columnarTable, 'array-row-table'); +// } + +// function convertArrowToObjectRowTable(arrowTable: arrow.Table): Table { +// const columnarTable = convertArrowToColumnarTable(arrowTable); +// return convertTable(columnarTable, 'object-row-table'); +// } + +// /** +// * Convert an Apache Arrow table to a ColumnarTable +// * @note Currently does not convert schema +// */ +// function convertArrowToColumnarTable(arrowTable: arrow.Table): ColumnarTable { +// // TODO - avoid calling `getColumn` on columns we are not interested in? +// // Add options object? + +// const columns: ColumnarTable['data'] = {}; + +// for (const field of arrowTable.schema.fields) { +// // This (is intended to) coalesce all record batches into a single typed array +// const arrowColumn = arrowTable.getChild(field.name); +// const values = arrowColumn?.toArray(); +// columns[field.name] = values; +// } + +// const schema = convertArrowToSchema(arrowTable.schema); + +// return { +// shape: 'columnar-table', +// schema, +// data: columns +// }; +// } diff --git a/modules/schema-utils/src/lib/table/tables/convert-table.ts b/modules/schema-utils/src/lib/table/tables/convert-table.ts index caa6efcf5f..27fb89e4d7 100644 --- a/modules/schema-utils/src/lib/table/tables/convert-table.ts +++ b/modules/schema-utils/src/lib/table/tables/convert-table.ts @@ -17,6 +17,7 @@ import { } from './table-accessors'; import {deduceTableSchema} from '../../schema/deduce-table-schema'; import {makeColumnFromField} from './table-column'; +import {convertTableToArrow} from './convert-arrow-table'; export function convertTable(table: Table, shape: 'object-row-table'): ObjectRowTable; export function convertTable(table: Table, shape: 'array-row-table'): ArrayRowTable; @@ -35,32 +36,20 @@ export function convertTable( ) { switch (shape) { case 'object-row-table': - return makeObjectRowTable(table); + return convertToObjectRowTable(table); case 'array-row-table': - return makeArrayRowTable(table); + return convertToArrayRowTable(table); case 'columnar-table': - return makeColumnarTable(table); + return convertToColumnarTable(table); case 'arrow-table': - return makeArrowTable(table); + return convertToArrowTable(table); default: throw new Error(shape); } } -/** - * Convert a table to apache arrow format - * @note this depends on the `@loaders.gl/arrow module being imported - */ -export function makeArrowTable(table: Table): Table { - const _makeArrowTable = globalThis.__loaders?._makeArrowTable; - if (!_makeArrowTable) { - throw new Error(''); - } - return _makeArrowTable(table); -} - /** Convert any simple table into columnar format */ -export function makeColumnarTable(table: Table): ColumnarTable { +export function convertToColumnarTable(table: Table): ColumnarTable { // TODO - should schema really be optional? const schema = table.schema || deduceTableSchema(table); const fields = table.schema?.fields || []; @@ -88,7 +77,7 @@ export function makeColumnarTable(table: Table): ColumnarTable { } /** Convert any table into array row format */ -export function makeArrayRowTable(table: Table): ArrayRowTable { +export function convertToArrayRowTable(table: Table): ArrayRowTable { if (table.shape === 'array-row-table') { return table; } @@ -105,7 +94,7 @@ export function makeArrayRowTable(table: Table): ArrayRowTable { } /** Convert any table into object row format */ -export function makeObjectRowTable(table: Table): ObjectRowTable { +export function convertToObjectRowTable(table: Table): ObjectRowTable { if (table.shape === 'object-row-table') { return table; } @@ -121,6 +110,19 @@ export function makeObjectRowTable(table: Table): ObjectRowTable { }; } +/** + * Convert a table to apache arrow format + * @note this depends on the `@loaders.gl/arrow module being imported + */ +export function convertToArrowTable(table: Table): ArrowTable { + const arrowTable = convertTableToArrow(table); + return { + shape: 'arrow-table', + schema: table.schema, + data: arrowTable + }; +} + /** * * @note - should be part of schema module diff --git a/modules/schema-utils/src/lib/table/tables/make-table.ts b/modules/schema-utils/src/lib/table/tables/make-table.ts index 3ccb1f3879..e50ffc450d 100644 --- a/modules/schema-utils/src/lib/table/tables/make-table.ts +++ b/modules/schema-utils/src/lib/table/tables/make-table.ts @@ -2,7 +2,14 @@ // SPDX-License-Identifier: MIT // Copyright (c) vis.gl contributors -import type {Table, ArrayRowTable, ObjectRowTable, ColumnarTable} from '@loaders.gl/schema'; +import * as arrow from 'apache-arrow'; +import type { + Table, + ArrayRowTable, + ObjectRowTable, + ColumnarTable, + ArrowTable +} from '@loaders.gl/schema'; import {deduceTableSchema} from '../../schema/deduce-table-schema'; /** @@ -12,6 +19,8 @@ import {deduceTableSchema} from '../../schema/deduce-table-schema'; export function makeTableFromData(data: unknown[][]): ArrayRowTable; export function makeTableFromData(data: {[column: string]: unknown}[]): ObjectRowTable; export function makeTableFromData(data: {[column: string]: ArrayLike}): ColumnarTable; +export function makeTableFromData(data: arrow.Table): ArrowTable; + export function makeTableFromData(data: unknown): Table { let table: Table; switch (getTableShapeFromData(data)) { @@ -24,6 +33,9 @@ export function makeTableFromData(data: unknown): Table { case 'columnar-table': table = {shape: 'columnar-table', data: data as {[column: string]: ArrayLike}}; break; + case 'arrow-table': + table = {shape: 'arrow-table', data: data as arrow.Table}; + break; default: throw new Error('table'); } @@ -33,6 +45,10 @@ export function makeTableFromData(data: unknown): Table { /** Helper function to get shape of data */ function getTableShapeFromData(data) { + if (data instanceof arrow.Table) { + return 'arrow-table'; + } + if (Array.isArray(data)) { if (data.length === 0) { throw new Error('cannot deduce type of empty table'); diff --git a/modules/schema-utils/src/lib/table/tables/table-types.ts b/modules/schema-utils/src/lib/table/tables/table-types.ts new file mode 100644 index 0000000000..965b9d050e --- /dev/null +++ b/modules/schema-utils/src/lib/table/tables/table-types.ts @@ -0,0 +1,37 @@ +// loaders.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import type { + Table, + ObjectRowTable, + ArrayRowTable, + ColumnarTable, + GeoJSONTable, + ArrowTable +} from '@loaders.gl/schema'; + +/** Checks if a table is of array row layout */ +export function isArrayRowTable(table: Table): table is ArrayRowTable { + return table.shape === 'array-row-table'; +} + +/** Checks if a table is of array row layout */ +export function isObjectRowTable(table: Table): table is ObjectRowTable { + return table.shape === 'object-row-table'; +} + +/** Checks if a table is of columnar layout */ +export function isColumnarTable(table: Table): table is ColumnarTable { + return table.shape === 'columnar-table'; +} + +/** Checks if a table is of GeoJSON format */ +export function isGeoJSONTable(table: Table): table is GeoJSONTable { + return table.shape === 'geojson-table'; +} + +/** Checks if table wraps an Apache Arrow table */ +export function isArrowTable(table: Table): table is ArrowTable { + return table.shape === 'arrow-table'; +} From bbad3227553ca51d961bd08a80a80ed6c8395718 Mon Sep 17 00:00:00 2001 From: Ib Green Date: Wed, 16 Oct 2024 16:54:35 -0400 Subject: [PATCH 2/7] fix --- modules/csv/src/csv-loader.ts | 42 +++++++++++++++---- modules/csv/test/csv-arrow-loader.spec.ts | 26 ++++++++---- .../src/lib/table/batches/convert-batches.ts | 2 +- 3 files changed, 54 insertions(+), 16 deletions(-) diff --git a/modules/csv/src/csv-loader.ts b/modules/csv/src/csv-loader.ts index 7abafba5a1..46fea8a806 100644 --- a/modules/csv/src/csv-loader.ts +++ b/modules/csv/src/csv-loader.ts @@ -3,10 +3,11 @@ // Copyright (c) vis.gl contributors import type {LoaderWithParser, LoaderOptions} from '@loaders.gl/loader-utils'; -import type {ArrayRowTable, ObjectRowTable, TableBatch} from '@loaders.gl/schema'; +import type {Schema, ArrayRowTable, ObjectRowTable, TableBatch} from '@loaders.gl/schema'; import { AsyncQueue, + deduceTableSchema, TableBatchBuilder, convertToArrayRow, convertToObjectRow @@ -89,7 +90,7 @@ async function parseCSV( csvText: string, options?: CSVLoaderOptions ): Promise { - // Apps can call the parse method directly, we so apply default options here + // Apps can call the parse method directly, so we apply default options here const csvOptions = {...CSVLoader.options.csv, ...options?.csv}; const firstRow = readFirstRow(csvText); @@ -115,20 +116,25 @@ async function parseCSV( const headerRow = result.meta.fields || generateHeader(csvOptions.columnPrefix, firstRow.length); const shape = csvOptions.shape || DEFAULT_CSV_SHAPE; + let table: ArrayRowTable | ObjectRowTable; switch (shape) { case 'object-row-table': - return { + table = { shape: 'object-row-table', data: rows.map((row) => (Array.isArray(row) ? convertToObjectRow(row, headerRow) : row)) }; + break; case 'array-row-table': - return { + table = { shape: 'array-row-table', data: rows.map((row) => (Array.isArray(row) ? row : convertToArrayRow(row, headerRow))) }; + break; default: throw new Error(shape); } + table.schema = deduceTableSchema(table!); + return table; } // TODO - support batch size 0 = no batching/single batch? @@ -151,7 +157,7 @@ function parseCSVInBatches( let isFirstRow: boolean = true; let headerRow: string[] | null = null; let tableBatchBuilder: TableBatchBuilder | null = null; - let schema: ObjectSchema | null = null; + let schema: Schema | null = null; const config = { // dynamicTyping: true, // Convert numbers and boolean values in rows from strings, @@ -199,7 +205,7 @@ function parseCSVInBatches( if (!headerRow) { headerRow = generateHeader(csvOptions.columnPrefix, row.length); } - schema = deduceSchema(row, headerRow); + schema = deduceCSVSchema(row, headerRow); } if (csvOptions.optimizeMemoryUsage) { @@ -314,7 +320,29 @@ function generateHeader(columnPrefix: string, count: number = 0): string[] { return headers; } -function deduceSchema(row, headerRow): ObjectSchema { +function deduceCSVSchema(row, headerRow): Schema { + const fields: Schema['fields'] = []; + for (let i = 0; i < row.length; i++) { + const columnName = (headerRow && headerRow[i]) || i; + const value = row[i]; + switch (typeof value) { + case 'number': + fields.push({name: String(columnName), type: 'float64', nullable: true}); + break; + case 'boolean': + fields.push({name: String(columnName), type: 'bool', nullable: true}); + break; + case 'string': + default: + fields.push({name: String(columnName), type: 'utf8', nullable: true}); + // We currently only handle numeric rows + // TODO we could offer a function to map strings to numbers? + } + } + return {fields, metadata: {'loaders.gl': 'CSVLoader'}}; +} + +function deduceObjectSchema(row, headerRow): ObjectSchema { const schema: ObjectSchema = headerRow ? {} : []; for (let i = 0; i < row.length; i++) { const columnName = (headerRow && headerRow[i]) || i; diff --git a/modules/csv/test/csv-arrow-loader.spec.ts b/modules/csv/test/csv-arrow-loader.spec.ts index c0035c9886..274b37e6e4 100644 --- a/modules/csv/test/csv-arrow-loader.spec.ts +++ b/modules/csv/test/csv-arrow-loader.spec.ts @@ -10,12 +10,10 @@ import * as arrow from 'apache-arrow'; // Small CSV Sample Files const CSV_NUMBERS_100_URL = '@loaders.gl/csv/test/data/numbers-100.csv'; const CSV_NUMBERS_10000_URL = '@loaders.gl/csv/test/data/numbers-10000.csv'; +const CSV_INCIDENTS_URL_QUOTES = '@loaders.gl/csv/test/data/sf_incidents-small.csv'; -test('CSVArrowLoader#loadInBatches(numbers-100.csv, arrow)', async (t) => { +test('CSVArrowLoader#loadInBatches(numbers-100.csv)', async (t) => { const iterator = await loadInBatches(CSV_NUMBERS_100_URL, CSVArrowLoader, { - csv: { - shape: 'arrow-table' - }, batchSize: 40 }); @@ -32,11 +30,8 @@ test('CSVArrowLoader#loadInBatches(numbers-100.csv, arrow)', async (t) => { t.end(); }); -test('CSVArrowLoader#loadInBatches(numbers-10000.csv, arrow)', async (t) => { +test('CSVArrowLoader#loadInBatches(numbers-10000.csv)', async (t) => { const iterator = await loadInBatches(CSV_NUMBERS_10000_URL, CSVArrowLoader, { - csv: { - shape: 'arrow-table' - }, batchSize: 2000 }); t.ok(isIterator(iterator) || isAsyncIterable(iterator), 'loadInBatches returned iterator'); @@ -51,3 +46,18 @@ test('CSVArrowLoader#loadInBatches(numbers-10000.csv, arrow)', async (t) => { t.end(); }); + +test('CSVArrowLoader#loadInBatches(incidents.csv)', async (t) => { + const iterator = await loadInBatches(CSV_INCIDENTS_URL_QUOTES, CSVArrowLoader); + t.ok(isIterator(iterator) || isAsyncIterable(iterator), 'loadInBatches returned iterator'); + + let batchCount = 0; + for await (const batch of iterator) { + t.ok(batch.data instanceof arrow.Table, 'returns arrow RecordBatch'); + // t.comment(`BATCH: ${batch.length}`); + batchCount++; + } + t.equal(batchCount, 5, 'Correct number of batches received'); + + t.end(); +}); diff --git a/modules/schema-utils/src/lib/table/batches/convert-batches.ts b/modules/schema-utils/src/lib/table/batches/convert-batches.ts index a030d30727..82734a6771 100644 --- a/modules/schema-utils/src/lib/table/batches/convert-batches.ts +++ b/modules/schema-utils/src/lib/table/batches/convert-batches.ts @@ -21,7 +21,7 @@ export function convertBatch( batch: TableBatch, shape: 'object-row-table' | 'array-row-table' | 'columnar-table' | 'arrow-table' ): TableBatch { - switch (batch.shape) { + switch (shape) { case 'object-row-table': return {...batch, ...convertTable(batch, 'object-row-table')}; case 'array-row-table': From 92055778695deee9e58f6f6df3626affff92a41b Mon Sep 17 00:00:00 2001 From: Ib Green Date: Wed, 16 Oct 2024 22:18:26 -0400 Subject: [PATCH 3/7] lint --- modules/csv/src/csv-loader.ts | 56 ++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/modules/csv/src/csv-loader.ts b/modules/csv/src/csv-loader.ts index 46fea8a806..6901544610 100644 --- a/modules/csv/src/csv-loader.ts +++ b/modules/csv/src/csv-loader.ts @@ -320,7 +320,7 @@ function generateHeader(columnPrefix: string, count: number = 0): string[] { return headers; } -function deduceCSVSchema(row, headerRow): Schema { +function deduceCSVSchema(row, headerRow, logger: Logger): Schema { const fields: Schema['fields'] = []; for (let i = 0; i < row.length; i++) { const columnName = (headerRow && headerRow[i]) || i; @@ -333,32 +333,40 @@ function deduceCSVSchema(row, headerRow): Schema { fields.push({name: String(columnName), type: 'bool', nullable: true}); break; case 'string': - default: fields.push({name: String(columnName), type: 'utf8', nullable: true}); - // We currently only handle numeric rows - // TODO we could offer a function to map strings to numbers? - } - } - return {fields, metadata: {'loaders.gl': 'CSVLoader'}}; -} - -function deduceObjectSchema(row, headerRow): ObjectSchema { - const schema: ObjectSchema = headerRow ? {} : []; - for (let i = 0; i < row.length; i++) { - const columnName = (headerRow && headerRow[i]) || i; - const value = row[i]; - switch (typeof value) { - case 'number': - case 'boolean': - // TODO - booleans could be handled differently... - schema[columnName] = {name: String(columnName), index: i, type: Float32Array}; break; - case 'string': default: - schema[columnName] = {name: String(columnName), index: i, type: Array}; - // We currently only handle numeric rows - // TODO we could offer a function to map strings to numbers? + logger + fields.push({name: String(columnName), type: 'utf8', nullable: true}); } } - return schema; + return { + fields, + metadata: { + 'loaders.gl#format': 'csv', + 'loaders.gl#loader': 'CSVLoader' + } + }; } + +// TODO - remove +// function deduceObjectSchema(row, headerRow): ObjectSchema { +// const schema: ObjectSchema = headerRow ? {} : []; +// for (let i = 0; i < row.length; i++) { +// const columnName = (headerRow && headerRow[i]) || i; +// const value = row[i]; +// switch (typeof value) { +// case 'number': +// case 'boolean': +// // TODO - booleans could be handled differently... +// schema[columnName] = {name: String(columnName), index: i, type: Float32Array}; +// break; +// case 'string': +// default: +// schema[columnName] = {name: String(columnName), index: i, type: Array}; +// // We currently only handle numeric rows +// // TODO we could offer a function to map strings to numbers? +// } +// } +// return schema; +// } From 8da8e0ea7095744d4375d3c6e46df3dc6a27a6aa Mon Sep 17 00:00:00 2001 From: Ib Green Date: Wed, 16 Oct 2024 22:33:25 -0400 Subject: [PATCH 4/7] fix --- modules/csv/src/csv-loader.ts | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/modules/csv/src/csv-loader.ts b/modules/csv/src/csv-loader.ts index 6901544610..7329b740c0 100644 --- a/modules/csv/src/csv-loader.ts +++ b/modules/csv/src/csv-loader.ts @@ -5,6 +5,7 @@ import type {LoaderWithParser, LoaderOptions} from '@loaders.gl/loader-utils'; import type {Schema, ArrayRowTable, ObjectRowTable, TableBatch} from '@loaders.gl/schema'; +import {log} from '@loaders.gl/loader-utils'; import { AsyncQueue, deduceTableSchema, @@ -15,9 +16,6 @@ import { import Papa from './papaparse/papaparse'; import AsyncIteratorStreamer from './papaparse/async-iterator-streamer'; -type ObjectField = {name: string; index: number; type: any}; -type ObjectSchema = {[key: string]: ObjectField} | ObjectField[]; - // __VERSION__ is injected by babel-plugin-version-inline // @ts-ignore TS2304: Cannot find name '__VERSION__'. const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest'; @@ -320,7 +318,7 @@ function generateHeader(columnPrefix: string, count: number = 0): string[] { return headers; } -function deduceCSVSchema(row, headerRow, logger: Logger): Schema { +function deduceCSVSchema(row, headerRow): Schema { const fields: Schema['fields'] = []; for (let i = 0; i < row.length; i++) { const columnName = (headerRow && headerRow[i]) || i; @@ -336,7 +334,7 @@ function deduceCSVSchema(row, headerRow, logger: Logger): Schema { fields.push({name: String(columnName), type: 'utf8', nullable: true}); break; default: - logger + log.warn(`CSV: Unknown column type: ${typeof value}`)(); fields.push({name: String(columnName), type: 'utf8', nullable: true}); } } @@ -350,6 +348,9 @@ function deduceCSVSchema(row, headerRow, logger: Logger): Schema { } // TODO - remove +// type ObjectField = {name: string; index: number; type: any}; +// type ObjectSchema = {[key: string]: ObjectField} | ObjectField[]; + // function deduceObjectSchema(row, headerRow): ObjectSchema { // const schema: ObjectSchema = headerRow ? {} : []; // for (let i = 0; i < row.length; i++) { From 5c14a22bc5bcc1eb18de71a573371f67e1f9250d Mon Sep 17 00:00:00 2001 From: Ib Green Date: Thu, 17 Oct 2024 11:10:19 -0400 Subject: [PATCH 5/7] fix schema handling in BatchAggregators --- modules/csv/test/csv-arrow-loader.spec.ts | 2 +- modules/csv/test/csv-loader.spec.ts | 1 + .../columnar-table-batch-aggregator.ts | 34 ++++++------------- .../row-table-batch-aggregator.ts | 4 +-- .../lib/table/tables/convert-arrow-table.ts | 1 + 5 files changed, 15 insertions(+), 27 deletions(-) diff --git a/modules/csv/test/csv-arrow-loader.spec.ts b/modules/csv/test/csv-arrow-loader.spec.ts index 274b37e6e4..2c1bdef38a 100644 --- a/modules/csv/test/csv-arrow-loader.spec.ts +++ b/modules/csv/test/csv-arrow-loader.spec.ts @@ -57,7 +57,7 @@ test('CSVArrowLoader#loadInBatches(incidents.csv)', async (t) => { // t.comment(`BATCH: ${batch.length}`); batchCount++; } - t.equal(batchCount, 5, 'Correct number of batches received'); + t.equal(batchCount, 1, 'Correct number of batches received'); t.end(); }); diff --git a/modules/csv/test/csv-loader.spec.ts b/modules/csv/test/csv-loader.spec.ts index 66b0c3c5bd..0f433bb341 100644 --- a/modules/csv/test/csv-loader.spec.ts +++ b/modules/csv/test/csv-loader.spec.ts @@ -247,6 +247,7 @@ test('CSVLoader#loadInBatches(sample.csv, object-rows)', async (t) => { // `BATCH ${batch.count}: ${batch.length} ${JSON.stringify(batch.data).slice(0, 200)}` // ); t.equal(batch.length, 2, 'Got correct batch size'); + debugger t.deepEqual(batch.data[0], {column1: 'A', column2: 'B', column3: 1}, 'Got correct first row'); } batchCount++; diff --git a/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts index e919a6e99a..c62f05e891 100644 --- a/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts +++ b/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts @@ -2,9 +2,9 @@ // SPDX-License-Identifier: MIT // Copyright (c) vis.gl contributors -import type {Schema, ColumnarTableBatch, ArrowTableBatch} from '@loaders.gl/schema'; +import type {Schema, ColumnarTableBatch, ArrowTableBatch, TypedArray} from '@loaders.gl/schema'; +import {getArrayTypeFromDataType} from '../../schema/data-type'; import {TableBatchAggregator} from './table-batch-aggregator'; - type ColumnarTableBatchOptions = {}; const DEFAULT_ROW_COUNT = 100; @@ -13,7 +13,7 @@ export class ColumnarTableBatchAggregator implements TableBatchAggregator { schema: Schema; length: number = 0; allocated: number = 0; - columns: {[columnName: string]: any[]} = {}; + columns: Record> = {}; constructor(schema: Schema, options: ColumnarTableBatchOptions) { this.schema = schema; @@ -46,24 +46,11 @@ export class ColumnarTableBatchAggregator implements TableBatchAggregator { getBatch(): ColumnarTableBatch | ArrowTableBatch | null { this._pruneColumns(); - const columns = Array.isArray(this.schema) ? this.columns : {}; - - // schema is an array if there're no headers - // object if there are headers - // columns should match schema format - if (!Array.isArray(this.schema)) { - for (const fieldName in this.schema) { - const field = this.schema[fieldName]; - columns[field.name] = this.columns[field.index]; - } - } - - this.columns = {}; const batch: ColumnarTableBatch = { shape: 'columnar-table', batchType: 'data', - data: columns, + data: this.columns, schema: this.schema, length: this.length }; @@ -82,23 +69,22 @@ export class ColumnarTableBatchAggregator implements TableBatchAggregator { this.allocated = this.allocated > 0 ? (this.allocated *= 2) : DEFAULT_ROW_COUNT; this.columns = {}; - for (const fieldName in this.schema) { - const field = this.schema[fieldName]; - const ArrayType = field.type || Float32Array; - const oldColumn = this.columns[field.index]; + for (const field of this.schema.fields) { + const ArrayType = getArrayTypeFromDataType(field.type, field.nullable); + const oldColumn = this.columns[field.name]; if (oldColumn && ArrayBuffer.isView(oldColumn)) { // Copy the old data to the new array const typedArray = new ArrayType(this.allocated); typedArray.set(oldColumn); - this.columns[field.index] = typedArray; + this.columns[field.name] = typedArray; } else if (oldColumn) { // Plain array oldColumn.length = this.allocated; - this.columns[field.index] = oldColumn; + this.columns[field.name] = oldColumn; } else { // Create new - this.columns[field.index] = new ArrayType(this.allocated); + this.columns[field.name] = new ArrayType(this.allocated); } } } diff --git a/modules/schema-utils/src/lib/table/batch-builder/row-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/row-table-batch-aggregator.ts index 1b1ae0efdb..5e35e95c88 100644 --- a/modules/schema-utils/src/lib/table/batch-builder/row-table-batch-aggregator.ts +++ b/modules/schema-utils/src/lib/table/batch-builder/row-table-batch-aggregator.ts @@ -31,8 +31,8 @@ export class RowTableBatchAggregator implements TableBatchAggregator { // object if there are headers if (schema) { this._headers = []; - for (const key in schema) { - this._headers[schema[key].index] = schema[key].name; + for (let i = 0; i < schema.fields.length; i++) { + this._headers[i] = schema.fields[i].name; } } } diff --git a/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts b/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts index 05c6bebceb..3a35541fe3 100644 --- a/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts +++ b/modules/schema-utils/src/lib/table/tables/convert-arrow-table.ts @@ -16,6 +16,7 @@ import type { import {convertTable} from './convert-table'; import {convertArrowToSchema} from '../../schema/convert-arrow-schema'; import {makeArrowRecordBatchIterator} from '../batches/make-arrow-batch-iterator'; + /** * * Convert a loaders.gl Table to an Apache Arrow Table * @param mesh From 5d5fef6ccc661b151db46fbc0a6c4fc427cf9f4a Mon Sep 17 00:00:00 2001 From: Ib Green Date: Thu, 17 Oct 2024 11:12:16 -0400 Subject: [PATCH 6/7] fix schema handling in BatchAggregators --- modules/csv/test/csv-loader.spec.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/csv/test/csv-loader.spec.ts b/modules/csv/test/csv-loader.spec.ts index 0f433bb341..66b0c3c5bd 100644 --- a/modules/csv/test/csv-loader.spec.ts +++ b/modules/csv/test/csv-loader.spec.ts @@ -247,7 +247,6 @@ test('CSVLoader#loadInBatches(sample.csv, object-rows)', async (t) => { // `BATCH ${batch.count}: ${batch.length} ${JSON.stringify(batch.data).slice(0, 200)}` // ); t.equal(batch.length, 2, 'Got correct batch size'); - debugger t.deepEqual(batch.data[0], {column1: 'A', column2: 'B', column3: 1}, 'Got correct first row'); } batchCount++; From 8b9725943c745ac077760bb61aa2545114484e97 Mon Sep 17 00:00:00 2001 From: Ib Green Date: Thu, 17 Oct 2024 11:56:03 -0400 Subject: [PATCH 7/7] fixes --- .../columnar-table-batch-aggregator.ts | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts b/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts index c62f05e891..7ecad44927 100644 --- a/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts +++ b/modules/schema-utils/src/lib/table/batch-builder/columnar-table-batch-aggregator.ts @@ -73,18 +73,19 @@ export class ColumnarTableBatchAggregator implements TableBatchAggregator { const ArrayType = getArrayTypeFromDataType(field.type, field.nullable); const oldColumn = this.columns[field.name]; - if (oldColumn && ArrayBuffer.isView(oldColumn)) { + if (!oldColumn) { + // Create new + this.columns[field.name] = new ArrayType(this.allocated); + } else if (Array.isArray(oldColumn)) { + // Plain array, just increase its size + oldColumn.length = this.allocated; + } else if (ArrayBuffer.isView(oldColumn)) { // Copy the old data to the new array const typedArray = new ArrayType(this.allocated); - typedArray.set(oldColumn); + if (ArrayBuffer.isView(typedArray)) { + typedArray.set(oldColumn); + } this.columns[field.name] = typedArray; - } else if (oldColumn) { - // Plain array - oldColumn.length = this.allocated; - this.columns[field.name] = oldColumn; - } else { - // Create new - this.columns[field.name] = new ArrayType(this.allocated); } } }