From 6b7bea90b462de868ec8fa198dba36c8865b0400 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat <61435908+saraswatpuneet@users.noreply.github.com> Date: Fri, 13 Sep 2024 06:16:31 -0700 Subject: [PATCH] Logical type : TIME (#143) # Problem Part of #99 ![image](https://github.com/user-attachments/assets/3526d69a-b89b-4513-b02c-39ff03e91af3) Add support for logical types in parquetjs, starting with `TIME`. Solution ======== The implementation follows the Parquet [spec](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype) ## Change summary: - Added a `logicalType` property to field definitions, codec options and parquet fields so Parquet logical types can be declared - Implemented type conversions from the `TIME` logical type to its primitive type according to the spec - Implemented field, schema and file tests to ensure everything works together ## Steps to Verify: 1. `npm run test` --- lib/codec/types.ts | 3 +- lib/declare.ts | 4 +- lib/fields.ts | 24 ++++- lib/jsonSchema.ts | 69 +++++++++++- lib/schema.ts | 2 + lib/types.ts | 72 ++++++++++++- test/decodeSchema.js | 8 ++ test/fields.test.ts | 65 +++++++++++ test/jsonSchema.test.ts | 89 ++++++++++++++- .../json-schema-test-file.result.json | 14 ++- .../json-schema-test-file.schema.result.json | 94 ++++++++++++++++ test/test-files/time.schema.json | 27 +++++ test/test-files/time.schema.result.json | 102 ++++++++++++++++++ test/test-files/time.schema_micros.json | 30 ++++++ .../test-files/time.schema_micros.result.json | 102 ++++++++++++++++++ test/test-files/time.schema_millis.json | 30 ++++++ .../test-files/time.schema_millis.result.json | 102 ++++++++++++++++++ test/test-files/time.schema_nanos.json | 30 ++++++ test/test-files/time.schema_nanos.result.json | 100 +++++++++++++++++ 19 files changed, 956 insertions(+), 11 deletions(-) create mode 100644 test/test-files/time.schema.json create mode 100644 test/test-files/time.schema.result.json create mode 100644 test/test-files/time.schema_micros.json create mode 100644 test/test-files/time.schema_micros.result.json create mode 
100644 test/test-files/time.schema_millis.json create mode 100644 test/test-files/time.schema_millis.result.json create mode 100644 test/test-files/time.schema_nanos.json create mode 100644 test/test-files/time.schema_nanos.result.json diff --git a/lib/codec/types.ts b/lib/codec/types.ts index 7334fd4d..49ff5667 100644 --- a/lib/codec/types.ts +++ b/lib/codec/types.ts @@ -1,6 +1,6 @@ import { PrimitiveType } from '../declare'; import { ParquetCodec, OriginalType, ParquetField } from '../declare'; -import { Statistics } from '../../gen-nodejs/parquet_types'; +import { LogicalType, Statistics } from '../../gen-nodejs/parquet_types'; export interface Options { typeLength: number; @@ -8,6 +8,7 @@ export interface Options { disableEnvelope?: boolean; primitiveType?: PrimitiveType; originalType?: OriginalType; + logicalType?: LogicalType; encoding?: ParquetCodec; compression?: string; column?: ParquetField; diff --git a/lib/declare.ts b/lib/declare.ts index 049813e9..ca347737 100644 --- a/lib/declare.ts +++ b/lib/declare.ts @@ -1,6 +1,6 @@ // Thanks to https://github.com/kbajalc/parquets -import parquet_thrift from '../gen-nodejs/parquet_types'; +import parquet_thrift, { LogicalType } from '../gen-nodejs/parquet_types'; import { Statistics, OffsetIndex, @@ -61,6 +61,7 @@ export type SchemaDefinition = Record; export interface FieldDefinition { type?: ParquetType; typeLength?: number; + logicalType?: LogicalType; encoding?: ParquetCodec; compression?: ParquetCompression; optional?: boolean; @@ -80,6 +81,7 @@ export interface ParquetField { primitiveType?: PrimitiveType; originalType?: OriginalType; repetitionType: RepetitionType; + logicalType?: LogicalType; typeLength?: number; encoding?: ParquetCodec; compression?: ParquetCompression; diff --git a/lib/fields.ts b/lib/fields.ts index cc54cb1e..b80cc3c6 100644 --- a/lib/fields.ts +++ b/lib/fields.ts @@ -1,6 +1,7 @@ // Helper functions for creating fields -import { FieldDefinition, ParquetType, SchemaDefinition } from 
'./declare'; +import { LogicalType, TimeType } from '../gen-nodejs/parquet_types'; +import { FieldDefinition, ParquetType, PrimitiveType, SchemaDefinition } from './declare'; export function createStringField(optional = true, fieldOptions: FieldDefinition = {}): FieldDefinition { return { ...fieldOptions, optional, type: 'UTF8' }; @@ -80,3 +81,24 @@ export function createListField( }, }; } + +export function createTimeField( + logicalType: TimeType, + optional = true, + fieldOptions: FieldDefinition = {} +): FieldDefinition { + let primitiveType: PrimitiveType; + if (logicalType.unit.MILLIS) { + primitiveType = 'INT32'; // TIME_MILLIS uses INT32 + } else if (logicalType.unit.MICROS || logicalType.unit.NANOS) { + primitiveType = 'INT64'; // TIME_MICROS and TIME_NANOS use INT64 + } else { + throw new Error('Unsupported time unit in logicalType'); + } + return { + ...fieldOptions, + optional, + type: primitiveType, + logicalType: new LogicalType({ TIME: logicalType }), + }; +} diff --git a/lib/jsonSchema.ts b/lib/jsonSchema.ts index 5466e1c3..f84c8098 100644 --- a/lib/jsonSchema.ts +++ b/lib/jsonSchema.ts @@ -2,6 +2,8 @@ import { JSONSchema4 } from 'json-schema'; import { FieldDefinition, SchemaDefinition } from './declare'; import * as fields from './fields'; +import { TimeUnit } from '../gen-nodejs/parquet_types'; +import { TimeType } from '../gen-nodejs/parquet_types'; type SupportedJSONSchema4 = Omit< JSONSchema4, @@ -70,18 +72,52 @@ const fromJsonSchemaArray = (fieldValue: SupportedJSONSchema4, optionalFieldList switch (fieldValue.items.type) { case 'string': - if (fieldValue.items.format && fieldValue.items.format == 'date-time') { + if (fieldValue.items.format && fieldValue.items.format === 'date-time') { return fields.createListField('TIMESTAMP_MILLIS', optionalFieldList); } return fields.createListField('UTF8', optionalFieldList); + case 'integer': return fields.createListField('INT64', optionalFieldList); + case 'number': return 
fields.createListField('DOUBLE', optionalFieldList); + case 'boolean': return fields.createListField('BOOLEAN', optionalFieldList); + case 'object': + // Handle array of time fields + if ( + fieldValue.items.properties && + fieldValue.items.properties.unit && + fieldValue.items.properties.isAdjustedToUTC + ) { + if (!fieldValue.items.properties.unit.enum) { + throw new UnsupportedJsonSchemaError('Unit enum is not defined'); + } + const unit = fieldValue.items.properties.unit.default || fieldValue.items.properties.unit.enum[0]; + const isAdjustedToUTC = !!fieldValue.items.properties.isAdjustedToUTC.default; + let timeUnit: TimeUnit; + + switch (unit) { + case 'MICROS': + timeUnit = new TimeUnit({ MICROS: true }); + break; + case 'NANOS': + timeUnit = new TimeUnit({ NANOS: true }); + break; + default: + timeUnit = new TimeUnit({ MILLIS: true }); + break; + } + + const timeLogicalType = new TimeType({ isAdjustedToUTC, unit: timeUnit }); + return fields.createTimeField(timeLogicalType, optionalFieldList); + } + return fields.createStructListField(fromJsonSchema(fieldValue.items), optionalFieldList); + default: throw new UnsupportedJsonSchemaError(`Array field type ${JSON.stringify(fieldValue.items)} is unsupported.`); } @@ -100,20 +136,49 @@ const fromJsonSchemaField = switch (fieldValue.type) { case 'string': - if (fieldValue.format && fieldValue.format == 'date-time') { + if (fieldValue.format && fieldValue.format === 'date-time') { return fields.createTimestampField(optional); } return fields.createStringField(optional); + case 'integer': return fields.createIntField(64, optional); + case 'number': return fields.createDoubleField(optional); + case 'boolean': return fields.createBooleanField(optional); + case 'array': return fromJsonSchemaArray(fieldValue, optional); + case 'object': + if (fieldValue.properties && fieldValue.properties.unit && fieldValue.properties.isAdjustedToUTC) { + if (!fieldValue.properties.unit.enum) { + throw new 
UnsupportedJsonSchemaError('Unit enum is not defined'); + } + const unit = fieldValue.properties.unit.default || fieldValue.properties.unit.enum[0]; + const isAdjustedToUTC = !!fieldValue.properties.isAdjustedToUTC.default; + let timeUnit: TimeUnit; + switch (unit) { + case 'MICROS': + timeUnit = new TimeUnit({ MICROS: true }); + break; + case 'NANOS': + timeUnit = new TimeUnit({ NANOS: true }); + break; + default: + timeUnit = new TimeUnit({ MILLIS: true }); + break; + } + + const timeLogicalType = new TimeType({ isAdjustedToUTC, unit: timeUnit }); + return fields.createTimeField(timeLogicalType, optional); + } + return fields.createStructField(fromJsonSchema(fieldValue), optional); + default: throw new UnsupportedJsonSchemaError( `Unable to convert "${fieldName}" with JSON Schema type "${fieldValue.type}" to a Parquet Schema.` diff --git a/lib/schema.ts b/lib/schema.ts index f33165dd..38e5adc6 100644 --- a/lib/schema.ts +++ b/lib/schema.ts @@ -126,6 +126,7 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP statistics: opts.statistics, fieldCount: Object.keys(opts.fields).length, fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat(name)), + logicalType: opts.logicalType, }; if (opts.type == 'LIST' || opts.type == 'MAP') fieldList[name].originalType = opts.type; @@ -174,6 +175,7 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP name: name, primitiveType: typeDef.primitiveType, originalType: typeDef.originalType, + logicalType: opts.logicalType, path: path.concat([name]), repetitionType: repetitionType, encoding: opts.encoding, diff --git a/lib/types.ts b/lib/types.ts index a2bb9a98..ab8f4a62 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -21,23 +21,28 @@ interface INTERVAL { milliseconds: number; } +interface TIME { + value: string | bigint | number; + unit: 'MILLIS' | 'MICROS' | 'NANOS'; + isAdjustedToUTC: boolean; +} + export function getParquetTypeDataObject( type: ParquetType, 
field?: ParquetField | Options | FieldDefinition ): ParquetTypeDataObject { if (type === 'DECIMAL') { - if (field?.typeLength !== undefined && field?.typeLength !== null) { + if (field?.typeLength !== undefined) { return { primitiveType: 'FIXED_LEN_BYTE_ARRAY', originalType: 'DECIMAL', typeLength: field.typeLength, toPrimitive: toPrimitive_FIXED_LEN_BYTE_ARRAY_DECIMAL, }; - } else if (field?.precision !== undefined && field?.precision !== null && field.precision > 18) { + } else if (field?.precision && field.precision > 18) { return { primitiveType: 'BYTE_ARRAY', originalType: 'DECIMAL', - typeLength: field.typeLength, toPrimitive: toPrimitive_BYTE_ARRAY_DECIMAL, }; } else { @@ -47,6 +52,29 @@ export function getParquetTypeDataObject( toPrimitive: toPrimitive_INT64, }; } + } else if (field?.logicalType?.TIME) { + const unit = field.logicalType.TIME.unit; + if (unit.MILLIS) { + return { + originalType: 'TIME_MILLIS', + primitiveType: 'INT32', + toPrimitive: toPrimitive_TIME, + }; + } + if (unit.MICROS) { + return { + originalType: 'TIME_MICROS', + primitiveType: 'INT64', + toPrimitive: toPrimitive_TIME, + }; + } + if (unit.NANOS) { + return { + primitiveType: 'INT64', + toPrimitive: toPrimitive_TIME, + }; + } + throw new Error('TIME type must have a valid unit (MILLIS, MICROS, NANOS).'); } else { return PARQUET_LOGICAL_TYPE_DATA[type]; } @@ -560,3 +588,41 @@ function checkValidValue(lowerRange: number | bigint, upperRange: number | bigin throw 'invalid value'; } } + +function toPrimitive_TIME(time: TIME): bigint | number { + const { value, unit, isAdjustedToUTC } = time; + + const timeValue = typeof value === 'string' ? BigInt(value) : BigInt(value); + + if (isAdjustedToUTC) { + return unit === 'MILLIS' ? 
Number(timeValue) : timeValue; + } else { + switch (unit) { + case 'MILLIS': + return Number(adjustToLocalTimestamp(timeValue, { MILLIS: true })); + case 'MICROS': + return adjustToLocalTimestamp(timeValue, { MICROS: true }); + case 'NANOS': + return adjustToLocalTimestamp(timeValue, { NANOS: true }); + default: + throw new Error(`Unsupported time unit: ${unit}`); + } + } +} + +function adjustToLocalTimestamp( + timestamp: bigint, + unit: { MILLIS?: boolean; MICROS?: boolean; NANOS?: boolean } +): bigint { + const localOffset = BigInt(new Date().getTimezoneOffset()) * 60n * 1000n; // Offset in milliseconds + + if (unit.MILLIS) { + return timestamp - localOffset; + } else if (unit.MICROS) { + return timestamp - localOffset * 1000n; + } else if (unit.NANOS) { + return timestamp - localOffset * 1000000n; + } + + throw new Error('Unsupported time unit'); +} diff --git a/test/decodeSchema.js b/test/decodeSchema.js index fecb9ec0..4eb0b823 100644 --- a/test/decodeSchema.js +++ b/test/decodeSchema.js @@ -120,6 +120,7 @@ describe('ParquetSchema', function () { dLevelMax: 0, isNested: true, fieldCount: 2, + logicalType: undefined, fields: { b: { name: 'b', @@ -130,6 +131,7 @@ describe('ParquetSchema', function () { dLevelMax: 0, isNested: true, fieldCount: 2, + logicalType: undefined, fields: { c: { name: 'c', @@ -140,6 +142,7 @@ describe('ParquetSchema', function () { dLevelMax: 0, isNested: true, fieldCount: 1, + logicalType: undefined, fields: { d: { name: 'd', @@ -150,6 +153,7 @@ describe('ParquetSchema', function () { statistics: undefined, typeLength: undefined, encoding: 'PLAIN', + logicalType: undefined, compression: 'UNCOMPRESSED', rLevelMax: 0, dLevelMax: 0, @@ -167,6 +171,7 @@ describe('ParquetSchema', function () { dLevelMax: 0, isNested: true, fieldCount: 2, + logicalType: undefined, fields: { f: { name: 'f', @@ -177,6 +182,7 @@ describe('ParquetSchema', function () { statistics: undefined, typeLength: undefined, encoding: 'PLAIN', + logicalType: undefined, 
compression: 'UNCOMPRESSED', rLevelMax: 0, dLevelMax: 0, @@ -192,6 +198,7 @@ describe('ParquetSchema', function () { statistics: undefined, typeLength: undefined, encoding: 'PLAIN', + logicalType: undefined, compression: 'UNCOMPRESSED', rLevelMax: 0, dLevelMax: 0, @@ -211,6 +218,7 @@ describe('ParquetSchema', function () { statistics: undefined, typeLength: undefined, encoding: 'PLAIN', + logicalType: undefined, compression: 'UNCOMPRESSED', rLevelMax: 0, dLevelMax: 0, diff --git a/test/fields.test.ts b/test/fields.test.ts index dc80a53a..4760f715 100644 --- a/test/fields.test.ts +++ b/test/fields.test.ts @@ -1,6 +1,7 @@ import { assert } from 'chai'; import { ParquetSchema } from '../parquet'; import * as fields from '../lib/fields'; +import { MicroSeconds, MilliSeconds, NanoSeconds, TimeType, TimeUnit } from '../gen-nodejs/parquet_types'; describe('Field Builders: Primitive Types', function () { it('Can use primitive field types: String', function () { @@ -209,6 +210,70 @@ describe('Field Builders: Structs and Struct List', function () { assert.equal(!!c.isNested, true); assert.equal(c.fieldCount, 1); }); + + it('Can use primitive field types: Time with default MILLIS', function () { + const schema = new ParquetSchema({ + timeField: fields.createTimeField( + new TimeType({ isAdjustedToUTC: true, unit: new TimeUnit({ MILLIS: new MilliSeconds() }) }), + true + ), + }); + const c = schema.fields.timeField; + assert.equal(c.name, 'timeField'); + assert.equal(c.primitiveType, 'INT32'); + assert.equal(c.originalType, 'TIME_MILLIS'); + assert.deepEqual(c.path, ['timeField']); + assert.equal(c.repetitionType, 'OPTIONAL'); + assert.equal(c.encoding, 'PLAIN'); + assert.equal(c.compression, 'UNCOMPRESSED'); + assert.equal(c.rLevelMax, 0); + assert.equal(c.dLevelMax, 1); + assert.equal(!!c.isNested, false); + assert.equal(c.fieldCount, undefined); + }); + + it('Can use primitive field types: Time with MICROS', function () { + const schema = new ParquetSchema({ + timeField: 
fields.createTimeField( + new TimeType({ isAdjustedToUTC: false, unit: new TimeUnit({ MICROS: new MicroSeconds() }) }), + true + ), + }); + const c = schema.fields.timeField; + assert.equal(c.name, 'timeField'); + assert.equal(c.primitiveType, 'INT64'); + assert.equal(c.originalType, 'TIME_MICROS'); + assert.deepEqual(c.path, ['timeField']); + assert.equal(c.repetitionType, 'OPTIONAL'); + assert.equal(c.encoding, 'PLAIN'); + assert.equal(c.compression, 'UNCOMPRESSED'); + assert.equal(c.rLevelMax, 0); + assert.equal(c.dLevelMax, 1); + assert.equal(!!c.isNested, false); + assert.equal(c.fieldCount, undefined); + }); + + it('Can use primitive field types: Time with NANOS', function () { + const schema = new ParquetSchema({ + timeField: fields.createTimeField( + new TimeType({ isAdjustedToUTC: true, unit: new TimeUnit({ NANOS: new NanoSeconds() }) }), + true, + { compression: 'GZIP' } + ), + }); + const c = schema.fields.timeField; + assert.equal(c.name, 'timeField'); + assert.equal(c.primitiveType, 'INT64'); + assert.equal(c.originalType, undefined); + assert.equal(c.compression, 'GZIP'); + assert.deepEqual(c.path, ['timeField']); + assert.equal(c.repetitionType, 'OPTIONAL'); + assert.equal(c.encoding, 'PLAIN'); + assert.equal(c.rLevelMax, 0); + assert.equal(c.dLevelMax, 1); + assert.equal(!!c.isNested, false); + assert.equal(c.fieldCount, undefined); + }); }); describe('Field Builders: Lists', function () { diff --git a/test/jsonSchema.test.ts b/test/jsonSchema.test.ts index d0f29a0a..e7ef4251 100644 --- a/test/jsonSchema.test.ts +++ b/test/jsonSchema.test.ts @@ -6,7 +6,10 @@ import addressSchema from './test-files/address.schema.json'; import arraySchema from './test-files/array.schema.json'; import objectSchema from './test-files/object.schema.json'; import objectNestedSchema from './test-files/object-nested.schema.json'; - +import timeSchema from './test-files/time.schema.json'; +import timeSchemaMillis from './test-files/time.schema_millis.json'; +import 
timeSchemaMicros from './test-files/time.schema_micros.json'; +import timeSchemaNanos from './test-files/time.schema_nanos.json'; import { ParquetSchema, ParquetWriter, ParquetReader } from '../parquet'; const update = false; @@ -52,6 +55,30 @@ describe('Json Schema Conversion', function () { const ps = ParquetSchema.fromJsonSchema(js); checkSnapshot(ps, './test-files/object-nested.schema.result.json', update); }); + + it('Time Schema Generic', function () { + const js = timeSchema as JSONSchema4; + const ps = ParquetSchema.fromJsonSchema(js); + checkSnapshot(ps, './test-files/time.schema.result.json', update); + }); + + it('Time Schema MILLIS', function () { + const js = timeSchemaMillis as JSONSchema4; + const ps = ParquetSchema.fromJsonSchema(js); + checkSnapshot(ps, './test-files/time.schema_millis.result.json', update); + }); + + it('Time Schema MICROS', function () { + const js = timeSchemaMicros as JSONSchema4; + const ps = ParquetSchema.fromJsonSchema(js); + checkSnapshot(ps, './test-files/time.schema_micros.result.json', update); + }); + + it('Time Schema NANOS', function () { + const js = timeSchemaNanos as JSONSchema4; + const ps = ParquetSchema.fromJsonSchema(js); + checkSnapshot(ps, './test-files/time.schema_nanos.result.json', update); + }); }); const parquetSchema = ParquetSchema.fromJsonSchema({ @@ -113,6 +140,22 @@ const parquetSchema = ParquetSchema.fromJsonSchema({ }, additionalItems: false, }, + time_field: { + type: 'object', + properties: { + value: { + type: 'number', + }, + unit: { + type: 'string', + enum: ['MILLIS', 'MICROS', 'NANOS'], // Define enum for time units + }, + isAdjustedToUTC: { + type: 'boolean', + }, + }, + additionalProperties: false, + }, }, additionalProperties: false, }); @@ -152,6 +195,48 @@ describe('Json Schema Conversion Test File', function () { }, ], }, + time_field: { + value: 1726067527, + unit: 'MILLIS', + isAdjustedToUTC: true, + }, + }; + + const row1FromParquetFile = { + string_field: 'string value', + 
int_field: 10n, + number_field: 2.5, + timestamp_array_field: { list: [{ element: new Date('2023-01-01 GMT') }] }, + + timestamp_field: new Date('2023-01-01 GMT'), + + array_field: { + list: [{ element: 'array_field val1' }, { element: 'array_field val2' }], + }, + + obj_field: { + sub1: 'obj_field_sub1 val', + sub2: 'obj_field_sub2 val', + }, + + struct_field: { + list: [ + { + element: { + sub8: { + list: [{ element: 'val1' }, { element: 'val2' }], + }, + sub3: 'struct_field_string val', + sub4: 'struct_field_string val', + sub5: { + sub6: 'struct_field_struct_string1 val', + sub7: 'struct_field_struct_string2 val', + }, + }, + }, + ], + }, + time_field: 1726067527, }; let reader: ParquetReader; @@ -178,7 +263,7 @@ describe('Json Schema Conversion Test File', function () { const cursor = reader.getCursor(); const row = await cursor.next(); const rowData = { - ...row1, + ...row1FromParquetFile, }; assert.deepEqual(row, rowData); }); diff --git a/test/test-files/json-schema-test-file.result.json b/test/test-files/json-schema-test-file.result.json index 3fd11733..40635142 100644 --- a/test/test-files/json-schema-test-file.result.json +++ b/test/test-files/json-schema-test-file.result.json @@ -4,7 +4,7 @@ "type_length": null, "repetition_type": null, "name": "root", - "num_children": 8, + "num_children": 9, "converted_type": null, "scale": null, "precision": null, @@ -298,5 +298,17 @@ "precision": null, "field_id": null, "logicalType": null + }, + { + "type": 1, + "type_length": null, + "repetition_type": 1, + "name": "time_field", + "num_children": null, + "converted_type": 7, + "scale": null, + "precision": null, + "field_id": null, + "logicalType": null } ] diff --git a/test/test-files/json-schema-test-file.schema.result.json b/test/test-files/json-schema-test-file.schema.result.json index f0fe1883..32c43ec5 100644 --- a/test/test-files/json-schema-test-file.schema.result.json +++ b/test/test-files/json-schema-test-file.schema.result.json @@ -135,6 +135,34 @@ } } 
} + }, + "time_field": { + "optional": true, + "type": "INT32", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" } }, "fields": { @@ -403,6 +431,39 @@ } }, "originalType": "LIST" + }, + "time_field": { + "name": "time_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "OPTIONAL", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 1 } }, "fieldList": [ @@ -1114,6 +1175,39 @@ "compression": "UNCOMPRESSED", "rLevelMax": 2, "dLevelMax": 5 + }, + { + "name": "time_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "OPTIONAL", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 1 } ] } diff --git a/test/test-files/time.schema.json b/test/test-files/time.schema.json new file mode 100644 index 00000000..eec63301 --- /dev/null +++ b/test/test-files/time.schema.json 
@@ -0,0 +1,27 @@ +{ + "$id": "https://example.com/time.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "A schema to test the TIME logical types in Parquet", + "type": "object", + "properties": { + "time_millis_field": { + "type": "object", + "properties": { + "value": { + "type": "number | string" + }, + "unit": { + "type": "string", + "enum": ["MILLIS", "MICROS", "NANOS"], + "description": "The unit for the time value" + }, + "isAdjustedToUTC": { + "type": "boolean", + "description": "Whether the time is adjusted to UTC" + } + }, + "required": ["value", "unit", "isAdjustedToUTC"] + } + }, + "required": ["time_millis_field"] +} diff --git a/test/test-files/time.schema.result.json b/test/test-files/time.schema.result.json new file mode 100644 index 00000000..1d4a94b3 --- /dev/null +++ b/test/test-files/time.schema.result.json @@ -0,0 +1,102 @@ +{ + "schema": { + "time_millis_field": { + "optional": false, + "type": "INT32", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + } + }, + "fields": { + "time_millis_field": { + "name": "time_millis_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_millis_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + 
"dLevelMax": 0 + } + }, + "fieldList": [ + { + "name": "time_millis_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_millis_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + ] +} diff --git a/test/test-files/time.schema_micros.json b/test/test-files/time.schema_micros.json new file mode 100644 index 00000000..652417df --- /dev/null +++ b/test/test-files/time.schema_micros.json @@ -0,0 +1,30 @@ +{ + "$id": "https://example.com/time-micros.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "A schema to test the TIME logical type in Parquet with MICROS unit", + "type": "object", + "properties": { + "time_field": { + "type": "object", + "properties": { + "value": { + "type": "number", + "description": "Time value in MICROS" + }, + "unit": { + "type": "string", + "enum": ["MILLIS", "MICROS", "NANOS"], + "default": "MICROS", + "description": "The unit is fixed to MICROS" + }, + "isAdjustedToUTC": { + "type": "boolean", + "description": "Whether the time is adjusted to UTC" + } + }, + "required": ["value", "isAdjustedToUTC"], + "additionalProperties": false + } + }, + "required": ["time_field"] +} diff --git a/test/test-files/time.schema_micros.result.json b/test/test-files/time.schema_micros.result.json new file mode 100644 index 00000000..889c98ef --- /dev/null +++ b/test/test-files/time.schema_micros.result.json @@ -0,0 +1,102 @@ +{ + "schema": { + "time_field": { + "optional": false, + "type": "INT64", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + 
"ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": {}, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + } + }, + "fields": { + "time_field": { + "name": "time_field", + "primitiveType": "INT64", + "originalType": "TIME_MICROS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": {}, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + }, + "fieldList": [ + { + "name": "time_field", + "primitiveType": "INT64", + "originalType": "TIME_MICROS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": {}, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + ] +} diff --git a/test/test-files/time.schema_millis.json b/test/test-files/time.schema_millis.json new file mode 100644 index 00000000..5cb0c4a8 --- /dev/null +++ b/test/test-files/time.schema_millis.json @@ -0,0 +1,30 @@ +{ + "$id": "https://example.com/time-millis.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "A schema to test the TIME logical type in Parquet with MILLIS unit", + "type": 
"object", + "properties": { + "time_field": { + "type": "object", + "properties": { + "value": { + "type": "number", + "description": "Time value in MILLIS" + }, + "unit": { + "type": "string", + "enum": ["MILLIS", "MICROS", "NANOS"], + "default": "MILLIS", + "description": "The unit is fixed to MILLIS" + }, + "isAdjustedToUTC": { + "type": "boolean", + "description": "Whether the time is adjusted to UTC" + } + }, + "required": ["value", "isAdjustedToUTC"], + "additionalProperties": false + } + }, + "required": ["time_field"] +} diff --git a/test/test-files/time.schema_millis.result.json b/test/test-files/time.schema_millis.result.json new file mode 100644 index 00000000..ddb9f433 --- /dev/null +++ b/test/test-files/time.schema_millis.result.json @@ -0,0 +1,102 @@ +{ + "schema": { + "time_field": { + "optional": false, + "type": "INT32", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + } + }, + "fields": { + "time_field": { + "name": "time_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + }, + "fieldList": [ + { + "name": "time_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + 
"STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + ] +} diff --git a/test/test-files/time.schema_nanos.json b/test/test-files/time.schema_nanos.json new file mode 100644 index 00000000..ceb12c8c --- /dev/null +++ b/test/test-files/time.schema_nanos.json @@ -0,0 +1,30 @@ +{ + "$id": "https://example.com/time-nanos.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "A schema to test the TIME logical type in Parquet with NANOS unit", + "type": "object", + "properties": { + "time_field": { + "type": "object", + "properties": { + "value": { + "type": "number", + "description": "Time value in NANOS" + }, + "unit": { + "type": "string", + "enum": ["MILLIS", "MICROS", "NANOS"], + "default": "NANOS", + "description": "The unit is fixed to NANOS" + }, + "isAdjustedToUTC": { + "type": "boolean", + "description": "Whether the time is adjusted to UTC" + } + }, + "required": ["value", "isAdjustedToUTC"], + "additionalProperties": false + } + }, + "required": ["time_field"] +} diff --git a/test/test-files/time.schema_nanos.result.json b/test/test-files/time.schema_nanos.result.json new file mode 100644 index 00000000..44e653bb --- /dev/null +++ b/test/test-files/time.schema_nanos.result.json @@ -0,0 +1,100 @@ +{ + "schema": { + "time_field": { + "optional": false, + "type": "INT64", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": null, + "NANOS": {} + } + }, + "TIMESTAMP": 
null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + } + }, + "fields": { + "time_field": { + "name": "time_field", + "primitiveType": "INT64", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": null, + "NANOS": {} + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + }, + "fieldList": [ + { + "name": "time_field", + "primitiveType": "INT64", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": null, + "NANOS": {} + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + ] +}