Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(shapefile): DBFArrowLoader #3142

Merged
merged 4 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions modules/gis/src/lib/table-converters/make-arrow-batch-iterator.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import * as arrow from 'apache-arrow';
import type {Table} from '@loaders.gl/schema';
import {
convertSchemaToArrow,
getTableLength,
getTableNumCols,
getTableCellAt
} from '@loaders.gl/schema-utils';

/**
 * Converts a loaders.gl table into a sequence of Apache Arrow record batches.
 *
 * One Arrow builder is created per field of the converted schema. Cells are appended
 * row by row, and each time `batchSize` complete rows have been accumulated the builders
 * are flushed into a `RecordBatch` that is yielded to the caller.
 *
 * @param table Table to convert; its `schema` is converted with `convertSchemaToArrow`.
 * @param options.batchSize Maximum number of rows per yielded batch. When omitted (or 0),
 *   all rows are emitted in a single batch.
 * @returns An iterator over record batches. Nothing is yielded for a table without rows.
 */
export function* makeTableToArrowBatchesIterator(
  table: Table,
  options?: {batchSize?: number}
): IterableIterator<arrow.RecordBatch> {
  const arrowSchema = convertSchemaToArrow(table.schema!);

  const length = getTableLength(table);
  const numColumns = getTableNumCols(table);
  // `||` is deliberate: an undefined or zero batch size means "one batch with every row"
  const batchSize = options?.batchSize || length;

  // `nullValues` makes the builders record null cells in the validity bitmap instead of
  // handing `null` to the value encoder (same configuration as ArrowTableBuilder uses)
  const builders = arrowSchema.fields.map((arrowField) =>
    arrow.makeBuilder({type: arrowField.type, nullValues: [null]})
  );
  const structField = new arrow.Struct(arrowSchema.fields);

  /** Flushes all column builders into a single record batch holding `numRows` rows */
  const flushRecordBatch = (numRows: number): arrow.RecordBatch => {
    const datas = builders.map((builder) => builder.flush());
    const structData = new arrow.Data(structField, 0, numRows, 0, undefined, datas);
    return new arrow.RecordBatch(arrowSchema, structData);
  };

  // Number of complete rows appended since the last flush
  let batchLength = 0;
  for (let rowIndex = 0; rowIndex < length; rowIndex++) {
    for (let columnIndex = 0; columnIndex < numColumns; ++columnIndex) {
      const value = getTableCellAt(table, rowIndex, columnIndex);
      builders[columnIndex].append(value);
    }

    // Count rows (not cells) and only flush on a row boundary, so that every column
    // in a batch has the same number of values
    batchLength++;
    if (batchLength >= batchSize) {
      yield flushRecordBatch(batchLength);
      batchLength = 0;
    }
  }

  // Emit the trailing rows that did not fill a whole batch
  if (batchLength > 0) {
    yield flushRecordBatch(batchLength);
    batchLength = 0;
  }

  builders.forEach((builder) => builder.finish());
}
4 changes: 2 additions & 2 deletions modules/json/test/lib/clarinet/clarinet.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,7 @@ test('clarinet#generic', (t) => {
// /\t|\n|\r| / means on whitespace
// '' means on every char
for (const sep in seps) {
t.comment('[' + key + '] should be able to parse -> ' + sep);
// t.comment('[' + key + '] should be able to parse -> ' + sep);
generic(t, key, false, sep);
}
}
Expand All @@ -820,7 +820,7 @@ test('#pre-chunked', (t) => {
continue;
}

t.comment('[' + key + '] should be able to parse pre-chunked');
// t.comment('[' + key + '] should be able to parse pre-chunked');
generic(t, key, true);
}
}
Expand Down
3 changes: 2 additions & 1 deletion modules/schema-utils/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,9 @@ export {
} from './lib/table/arrow-api/index';

// EXPERIMENTAL APIs
export {ArrowTableBuilder} from './lib/table/batch-builder/arrow-table-builder';

// SCHEMA UTILS
// Schema utils
export {getTypeInfo} from './lib/table/arrow-api/get-type-info';

export {default as AsyncQueue} from './lib/utils/async-queue';
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import type {Schema, ArrowTable, ArrowTableBatch} from '@loaders.gl/schema';
import * as arrow from 'apache-arrow';
import {convertSchemaToArrow} from '@loaders.gl/schema-utils';

/**
 * Incrementally builds an Apache Arrow table, or a sequence of Arrow table batches.
 *
 * Rows are appended to one Arrow builder per schema field. The accumulated rows can be
 * extracted as batches (`firstBatch`, `flushBatch`, `finishBatch`) or as a single table
 * (`finishTable`). Extracting a batch flushes the builders, so each row is emitted once.
 */
export class ArrowTableBuilder {
  /** loaders.gl schema describing the columns */
  schema: Schema;
  /** Arrow schema converted from `schema` */
  arrowSchema: arrow.Schema;
  /** One builder per field of `arrowSchema`, in field order */
  arrowBuilders: arrow.Builder[];
  /** Total number of rows added so far (not reset when batches are flushed) */
  length: number;

  /**
   * @param schema Schema of the table to build; converted to an Arrow schema up front.
   */
  constructor(schema: Schema) {
    this.schema = schema;
    this.arrowSchema = convertSchemaToArrow(schema);
    // `nullValues: [null]` makes the builders record `null` cells as Arrow nulls
    this.arrowBuilders = this.arrowSchema.fields.map((field) =>
      arrow.makeBuilder({type: field.type, nullValues: [null]})
    );
    this.length = 0;
  }

  /**
   * Appends a row given as an object keyed by column name.
   * @param row Values are looked up by the field names of `schema`.
   */
  addObjectRow(row: {[key: string]: any}) {
    for (let i = 0; i < this.arrowBuilders.length; i++) {
      const columnName = this.schema.fields[i].name;
      this.arrowBuilders[i].append(row[columnName]);
    }
    this.length++;
  }

  /**
   * Appends a row given as an array of values in column order.
   * @param row Value `row[i]` is appended to the builder of column `i`.
   */
  addArrayRow(row: any[]) {
    for (let i = 0; i < this.arrowBuilders.length; i++) {
      this.arrowBuilders[i].append(row[i]);
    }
    this.length++;
  }

  /**
   * Makes sure that a first batch with schema is sent even if no rows.
   * @returns An empty batch carrying the schema, or `null` when rows have already been
   *   added (a data batch will carry the schema in that case).
   */
  firstBatch(): ArrowTableBatch | null {
    // Decide from the row counter rather than by flushing: flushing here and then
    // returning null would silently discard rows that are waiting in the builders
    if (this.length > 0) {
      return null;
    }
    return this._makeArrowTableBatch(this._getArrowRecordBatch());
  }

  /**
   * Flushes the rows accumulated since the last flush.
   * @returns A batch with those rows, or `null` when no rows are pending.
   */
  flushBatch(): ArrowTableBatch | null {
    const arrowRecordBatch = this._getArrowRecordBatch();
    // "No data" is a property of the row count, not of the column count
    if (arrowRecordBatch.numRows === 0) {
      return null;
    }
    return this._makeArrowTableBatch(arrowRecordBatch);
  }

  /**
   * Gets a last batch if any data is left, and finishes the builders.
   * @returns A batch with the remaining rows, or `null` when no rows are pending.
   */
  finishBatch(): ArrowTableBatch | null {
    const arrowRecordBatch = this._getArrowRecordBatch();
    this.arrowBuilders.forEach((builder) => builder.finish());
    if (arrowRecordBatch.numRows === 0) {
      return null;
    }
    return this._makeArrowTableBatch(arrowRecordBatch);
  }

  /**
   * Returns a table with all the rows currently held in the builders, and finishes the builders.
   */
  finishTable(): ArrowTable {
    const arrowRecordBatch = this._getArrowRecordBatch();
    this.arrowBuilders.forEach((builder) => builder.finish());
    return {
      shape: 'arrow-table',
      schema: this.schema,
      data: new arrow.Table(arrowRecordBatch)
    };
  }

  /** Extract a record batch flushing the currently accumulated data in the builders */
  _getArrowRecordBatch(): arrow.RecordBatch {
    const {arrowBuilders, arrowSchema} = this;
    const arrowDatas = arrowBuilders.map((builder) => builder.flush());
    // All columns hold the same number of rows; a schema without fields has no column to ask
    const length = arrowDatas[0]?.length ?? 0;
    const structField = new arrow.Struct(arrowSchema.fields);
    const arrowStructData = new arrow.Data(structField, 0, length, 0, undefined, arrowDatas);
    return new arrow.RecordBatch(arrowSchema, arrowStructData);
  }

  /** Wraps a record batch in the loaders.gl `ArrowTableBatch` envelope */
  _makeArrowTableBatch(arrowRecordBatch: arrow.RecordBatch): ArrowTableBatch {
    return {
      shape: 'arrow-table',
      batchType: 'data',
      length: arrowRecordBatch.numRows,
      schema: this.schema,
      data: new arrow.Table(arrowRecordBatch)
    };
  }
}
46 changes: 46 additions & 0 deletions modules/shapefile/src/dbf-arrow-loader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import type {Loader, LoaderWithParser, LoaderOptions} from '@loaders.gl/loader-utils';
import type {ArrowTable, ArrowTableBatch} from '@loaders.gl/schema';
import {parseDBF, parseDBFInBatches} from './lib/parsers/parse-dbf-to-arrow';
import {DBFFormat} from './dbf-format';

// __VERSION__ is injected by babel-plugin-version-inline
// When it has not been injected (the identifier is undefined) the version falls back to 'latest'
// @ts-ignore TS2304: Cannot find name '__VERSION__'.
const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest';

/** Options for the DBF loaders: the generic `LoaderOptions` plus an optional `dbf` section */
export type DBFLoaderOptions = LoaderOptions & {
dbf?: {
/** Encoding name; the loaders declared in this file default it to 'latin1' (presumably the text encoding of the DBF fields - confirm against the parser) */
encoding?: string;
/** Override the URL to the worker bundle (by default loads from unpkg.com) */
workerUrl?: string;
};
};

/**
 * DBFArrowWorkerLoader - worker loader definition for DBF files, typed to produce Arrow tables.
 * DBF files are used to contain non-geometry columns in Shapefiles.
 * This object carries only metadata and default options; the parse functions are added by `DBFArrowLoader`.
 */
export const DBFArrowWorkerLoader = {
// Format metadata (name, id, extensions, mime types, ...) shared via DBFFormat
...DBFFormat,
// `dataType` / `batchType` are null at runtime; the casts only carry the result types for type inference
dataType: null as unknown as ArrowTable,
batchType: null as unknown as ArrowTableBatch,
version: VERSION,
worker: true,
options: {
dbf: {
// Default encoding used when the caller does not supply `options.dbf.encoding`
encoding: 'latin1'
}
}
} as const satisfies Loader<ArrowTable, ArrowTableBatch, DBFLoaderOptions>;

/**
 * DBF file loader producing Arrow tables.
 * Extends `DBFArrowWorkerLoader` with parse functions that delegate to
 * `parseDBF` / `parseDBFInBatches` from './lib/parsers/parse-dbf-to-arrow'.
 */
export const DBFArrowLoader = {
...DBFArrowWorkerLoader,
// Async entry point: wraps the synchronous `parseDBF` in an async function
parse: async (arrayBuffer, options) => parseDBF(arrayBuffer, options),
parseSync: parseDBF,
// Batched entry point: accepts a sync or async iterable of ArrayBuffer chunks
parseInBatches(arrayBufferIterator: AsyncIterable<ArrayBuffer> | Iterable<ArrayBuffer>, options) {
return parseDBFInBatches(arrayBufferIterator, options);
}
} as const satisfies LoaderWithParser<ArrowTable, ArrowTableBatch, DBFLoaderOptions>;
15 changes: 15 additions & 0 deletions modules/shapefile/src/dbf-format.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import type {Format} from '@loaders.gl/loader-utils';

/**
 * Information about the DBF format.
 * Declared `as const satisfies Format` so the literal values keep their narrow types
 * while still being checked against the `Format` type.
 */
export const DBFFormat = {
name: 'DBF',
id: 'dbf',
// Name of the module that provides the format
module: 'shapefile',
category: 'table',
// File extensions (without the leading dot) and MIME types associated with the format
extensions: ['dbf'],
mimeTypes: ['application/x-dbf']
} as const satisfies Format;
1 change: 1 addition & 0 deletions modules/shapefile/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export {ShapefileLoader} from './shapefile-loader';

export type {DBFLoaderOptions} from './dbf-loader';
export {DBFLoader, DBFWorkerLoader} from './dbf-loader';
export {DBFArrowLoader, DBFArrowWorkerLoader} from './dbf-arrow-loader';

export type {SHPLoaderOptions} from './shp-loader';
export {SHPLoader, SHPWorkerLoader} from './shp-loader';
Expand Down
Loading
Loading