Skip to content

Commit

Permalink
feat(shapefile): DBFArrowLoader (#3142)
Browse files Browse the repository at this point in the history
  • Loading branch information
ibgreen authored Oct 18, 2024
1 parent 4784e4b commit fbb72f5
Show file tree
Hide file tree
Showing 10 changed files with 665 additions and 3 deletions.
53 changes: 53 additions & 0 deletions modules/gis/src/lib/table-converters/make-arrow-batch-iterator.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import * as arrow from 'apache-arrow';
import type {Table} from '@loaders.gl/schema';
import {
convertSchemaToArrow,
getTableLength,
getTableNumCols,
getTableCellAt
} from '@loaders.gl/schema-utils';

/**
 * Converts a loaders.gl table into a sequence of Apache Arrow record batches.
 *
 * Cells are read row by row through `getTableCellAt` and appended to one arrow builder
 * per schema field. A record batch is yielded each time `batchSize` complete rows have
 * been accumulated, followed by a final, smaller batch for any remaining rows.
 *
 * @param table - Source table. Its `schema` is required to derive the arrow schema.
 * @param options - Optional settings.
 * @param options.batchSize - Number of rows per yielded batch. When omitted (or 0), all
 *   rows are emitted in a single batch.
 * @returns An iterator over record batches; nothing is yielded for a table with no rows.
 */
export function* makeTableToArrowBatchesIterator(
  table: Table,
  options?: {batchSize?: number}
): IterableIterator<arrow.RecordBatch> {
  const arrowSchema = convertSchemaToArrow(table.schema!);

  const length = getTableLength(table);
  const numColumns = getTableNumCols(table);
  // `||` is deliberate: a missing or zero batch size means "one batch with every row"
  const batchSize = options?.batchSize || length;

  // Register `null` as a null value so that empty cells are recorded as arrow nulls,
  // matching how ArrowTableBuilder configures its builders
  const builders = arrowSchema.fields.map((arrowField) =>
    arrow.makeBuilder({type: arrowField.type, nullValues: [null]})
  );
  const structField = new arrow.Struct(arrowSchema.fields);

  /** Flushes every column builder and wraps the column data in a record batch of `numRows` rows */
  const flushRecordBatch = (numRows: number): arrow.RecordBatch => {
    const datas = builders.map((builder) => builder.flush());
    const structData = new arrow.Data(structField, 0, numRows, 0, undefined, datas);
    return new arrow.RecordBatch(arrowSchema, structData);
  };

  let batchLength = 0;
  for (let rowIndex = 0; rowIndex < length; rowIndex++) {
    for (let columnIndex = 0; columnIndex < numColumns; ++columnIndex) {
      const value = getTableCellAt(table, rowIndex, columnIndex);
      builders[columnIndex].append(value);
    }

    // Count rows (not cells) and only flush between rows, so that every column in a
    // batch holds the same number of values
    batchLength++;
    if (batchLength >= batchSize) {
      yield flushRecordBatch(batchLength);
      batchLength = 0;
    }
  }

  // Emit whatever is left over after the last full batch
  if (batchLength > 0) {
    yield flushRecordBatch(batchLength);
    batchLength = 0;
  }

  builders.forEach((builder) => builder.finish());
}
4 changes: 2 additions & 2 deletions modules/json/test/lib/clarinet/clarinet.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,7 @@ test('clarinet#generic', (t) => {
// /\t|\n|\r| / means on whitespace
// '' means on every char
for (const sep in seps) {
t.comment('[' + key + '] should be able to parse -> ' + sep);
// t.comment('[' + key + '] should be able to parse -> ' + sep);
generic(t, key, false, sep);
}
}
Expand All @@ -820,7 +820,7 @@ test('#pre-chunked', (t) => {
continue;
}

t.comment('[' + key + '] should be able to parse pre-chunked');
// t.comment('[' + key + '] should be able to parse pre-chunked');
generic(t, key, true);
}
}
Expand Down
3 changes: 2 additions & 1 deletion modules/schema-utils/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,9 @@ export {
} from './lib/table/arrow-api/index';

// EXPERIMENTAL APIs
export {ArrowTableBuilder} from './lib/table/batch-builder/arrow-table-builder';

// SCHEMA UTILS
// Schema utils
export {getTypeInfo} from './lib/table/arrow-api/get-type-info';

export {default as AsyncQueue} from './lib/utils/async-queue';
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import type {Schema, ArrowTable, ArrowTableBatch} from '@loaders.gl/schema';
import * as arrow from 'apache-arrow';
import {convertSchemaToArrow} from '@loaders.gl/schema-utils';

/**
 * Builds an arrow table or batches.
 *
 * Rows are appended to one arrow builder per schema field. The accumulated rows can then
 * be extracted as a sequence of batches (`firstBatch` / `flushBatch` / `finishBatch`) or
 * as a single table (`finishTable`). Extracting flushes the builders, so each row is
 * returned at most once.
 */
export class ArrowTableBuilder {
  /** Schema supplied to the constructor */
  schema: Schema;
  /** Arrow schema derived from `schema` via `convertSchemaToArrow` */
  arrowSchema: arrow.Schema;
  /** One arrow builder per field of `arrowSchema`, in field order */
  arrowBuilders: arrow.Builder[];
  /** Total number of rows added so far. Not reset when rows are flushed into a batch. */
  length: number;
  /** Number of rows added since the builders were last flushed */
  private pendingRowCount: number;

  /**
   * @param schema - Schema describing the columns of the rows that will be added
   */
  constructor(schema: Schema) {
    this.schema = schema;
    this.arrowSchema = convertSchemaToArrow(schema);
    this.arrowBuilders = this.arrowSchema.fields.map((field) =>
      arrow.makeBuilder({type: field.type, nullValues: [null]})
    );
    this.length = 0;
    this.pendingRowCount = 0;
  }

  /**
   * Appends a row given as an object keyed by column name.
   * @param row - Values are looked up by the field names of `schema`
   */
  addObjectRow(row: {[key: string]: any}) {
    for (let i = 0; i < this.arrowBuilders.length; i++) {
      const columnName = this.schema.fields[i].name;
      this.arrowBuilders[i].append(row[columnName]);
    }
    this.length++;
    this.pendingRowCount++;
  }

  /**
   * Appends a row given as an array of values.
   * @param row - Values in schema field order
   */
  addArrayRow(row: any[]) {
    for (let i = 0; i < this.arrowBuilders.length; i++) {
      this.arrowBuilders[i].append(row[i]);
    }
    this.length++;
    this.pendingRowCount++;
  }

  /**
   * Makes sure that a first batch with schema is sent even if no rows
   * @returns An empty batch carrying the schema when no rows are pending, otherwise `null`
   */
  firstBatch(): ArrowTableBatch | null {
    // If there is data, a batch will be sent later. Decide before touching the builders:
    // extracting a record batch flushes them, which would discard the pending rows.
    if (this.pendingRowCount > 0) {
      return null;
    }
    return this._makeTableBatch(this._getArrowRecordBatch());
  }

  /**
   * Flush the current batch if conditions are right
   * @returns A batch with the rows added since the last flush, or `null` if there are none
   */
  flushBatch(): ArrowTableBatch | null {
    if (this.pendingRowCount === 0) {
      return null;
    }
    return this._makeTableBatch(this._getArrowRecordBatch());
  }

  /**
   * Get a last batch if any data is left, and finish the builders
   * @returns A batch with the remaining rows, or `null` if there are none
   */
  finishBatch(): ArrowTableBatch | null {
    const lastBatch = this.flushBatch();
    this.arrowBuilders.forEach((builder) => builder.finish());
    return lastBatch;
  }

  /**
   * Return a table with all the accumulated data, and finish the builders.
   * Only rows that have not already been flushed into a batch are included.
   */
  finishTable(): ArrowTable {
    const arrowRecordBatch = this._getArrowRecordBatch();
    this.arrowBuilders.forEach((builder) => builder.finish());
    return {
      shape: 'arrow-table',
      schema: this.schema,
      data: new arrow.Table(arrowRecordBatch)
    };
  }

  /** Extract a record batch flushing the currently accumulated data in the builders */
  _getArrowRecordBatch(): arrow.RecordBatch {
    const {arrowBuilders, arrowSchema} = this;
    const arrowDatas = arrowBuilders.map((builder) => builder.flush());
    // With a zero-column schema there is no column data to read the row count from
    const length = arrowDatas.length > 0 ? arrowDatas[0].length : this.pendingRowCount;
    this.pendingRowCount = 0;
    const structField = new arrow.Struct(arrowSchema.fields);
    const arrowStructData = new arrow.Data(structField, 0, length, 0, undefined, arrowDatas);
    return new arrow.RecordBatch(arrowSchema, arrowStructData);
  }

  /** Wraps a record batch in the loaders.gl arrow table batch structure */
  _makeTableBatch(arrowRecordBatch: arrow.RecordBatch): ArrowTableBatch {
    return {
      shape: 'arrow-table',
      batchType: 'data',
      length: arrowRecordBatch.numRows,
      schema: this.schema,
      data: new arrow.Table(arrowRecordBatch)
    };
  }
}
46 changes: 46 additions & 0 deletions modules/shapefile/src/dbf-arrow-loader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import type {Loader, LoaderWithParser, LoaderOptions} from '@loaders.gl/loader-utils';
import type {ArrowTable, ArrowTableBatch} from '@loaders.gl/schema';
import {parseDBF, parseDBFInBatches} from './lib/parsers/parse-dbf-to-arrow';
import {DBFFormat} from './dbf-format';

// __VERSION__ is injected by babel-plugin-version-inline
// Falls back to 'latest' when __VERSION__ is not defined at runtime
// @ts-ignore TS2304: Cannot find name '__VERSION__'.
const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest';

/** Loader options extended with an optional `dbf` section */
export type DBFLoaderOptions = LoaderOptions & {
/** DBF-specific options */
dbf?: {
/** Encoding name; presumably the text encoding used to decode DBF string data (the loaders in this file default it to 'latin1') */
encoding?: string;
/** Override the URL to the worker bundle (by default loads from unpkg.com) */
workerUrl?: string;
};
};

/**
 * DBFArrowWorkerLoader - loader definition without parser functions, typed to produce
 * ArrowTable / ArrowTableBatch results.
 * DBF files are used to contain non-geometry columns in Shapefiles
 */
export const DBFArrowWorkerLoader = {
// Format metadata (name, id, extensions, ...) shared through DBFFormat
...DBFFormat,
// Type-only placeholders: the runtime value is null, the casts just carry the result types
dataType: null as unknown as ArrowTable,
batchType: null as unknown as ArrowTableBatch,
version: VERSION,
worker: true,
// NOTE(review): presumably these act as the default option values for this loader - confirm
options: {
dbf: {
encoding: 'latin1'
}
}
} as const satisfies Loader<ArrowTable, ArrowTableBatch, DBFLoaderOptions>;

/**
 * DBF file loader: the worker loader definition extended with
 * async, sync and batched parser entry points.
 */
export const DBFArrowLoader = {
  ...DBFArrowWorkerLoader,
  /** Async wrapper around the synchronous DBF parser */
  async parse(arrayBuffer, options) {
    return parseDBF(arrayBuffer, options);
  },
  parseSync: parseDBF,
  /** Hands the chunk iterator straight to the batched DBF parser */
  parseInBatches: (
    arrayBufferIterator: AsyncIterable<ArrayBuffer> | Iterable<ArrayBuffer>,
    options
  ) => parseDBFInBatches(arrayBufferIterator, options)
} as const satisfies LoaderWithParser<ArrowTable, ArrowTableBatch, DBFLoaderOptions>;
15 changes: 15 additions & 0 deletions modules/shapefile/src/dbf-format.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import type {Format} from '@loaders.gl/loader-utils';

/**
 * Information about the DBF format: display name, id, owning module, category,
 * file extensions and MIME types.
 */
export const DBFFormat = {
name: 'DBF',
id: 'dbf',
module: 'shapefile',
category: 'table',
extensions: ['dbf'],
mimeTypes: ['application/x-dbf']
} as const satisfies Format;
1 change: 1 addition & 0 deletions modules/shapefile/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export {ShapefileLoader} from './shapefile-loader';

export type {DBFLoaderOptions} from './dbf-loader';
export {DBFLoader, DBFWorkerLoader} from './dbf-loader';
export {DBFArrowLoader, DBFArrowWorkerLoader} from './dbf-arrow-loader';

export type {SHPLoaderOptions} from './shp-loader';
export {SHPLoader, SHPWorkerLoader} from './shp-loader';
Expand Down
Loading

0 comments on commit fbb72f5

Please sign in to comment.