-
Notifications
You must be signed in to change notification settings - Fork 80
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ISSUE-344] Deserialize json in kafka source (#354)
* [ISSUE-344] Support deserializing json in kafka source * [ISSUE-344] Support deserializing json in kafka source * [ISSUE-344] Support deserializing json in kafka source * [ISSUE-344] Support deserializing json in kafka source * [ISSUE-344] Support deserializing json in kafka source * [ISSUE-344] Support deserializing json in kafka source
- Loading branch information
1 parent
f804128
commit 526a107
Showing
11 changed files
with
269 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
30 changes: 30 additions & 0 deletions
30
...r-api/src/main/java/com/antgroup/geaflow/dsl/connector/api/serde/DeserializerFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
package com.antgroup.geaflow.dsl.connector.api.serde; | ||
|
||
import com.antgroup.geaflow.common.config.Configuration; | ||
import com.antgroup.geaflow.common.config.keys.ConnectorConfigKeys; | ||
import com.antgroup.geaflow.dsl.connector.api.serde.impl.JsonDeserializer; | ||
import com.antgroup.geaflow.dsl.connector.api.serde.impl.RowTableDeserializer; | ||
import com.antgroup.geaflow.dsl.connector.api.serde.impl.TextDeserializer; | ||
import com.antgroup.geaflow.dsl.connector.api.util.ConnectorConstants; | ||
|
||
public class DeserializerFactory { | ||
|
||
public static <IN> TableDeserializer<IN> loadDeserializer(Configuration conf) { | ||
String connectorFormat = conf.getString(ConnectorConfigKeys.GEAFLOW_DSL_CONNECTOR_FORMAT, | ||
(String) ConnectorConfigKeys.GEAFLOW_DSL_CONNECTOR_FORMAT.getDefaultValue()); | ||
if (connectorFormat.equals(ConnectorConstants.CONNECTOR_FORMAT_JSON)) { | ||
return (TableDeserializer<IN>) new JsonDeserializer(); | ||
} else { | ||
return (TableDeserializer<IN>) new TextDeserializer(); | ||
} | ||
} | ||
|
||
public static <IN> TableDeserializer<IN> loadRowTableDeserializer() { | ||
return (TableDeserializer<IN>) new RowTableDeserializer(); | ||
} | ||
|
||
public static <IN> TableDeserializer<IN> loadTextDeserializer() { | ||
return (TableDeserializer<IN>) new TextDeserializer(); | ||
} | ||
|
||
} |
79 changes: 79 additions & 0 deletions
79
...api/src/main/java/com/antgroup/geaflow/dsl/connector/api/serde/impl/JsonDeserializer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
package com.antgroup.geaflow.dsl.connector.api.serde.impl; | ||
|
||
import com.antgroup.geaflow.common.config.Configuration; | ||
import com.antgroup.geaflow.common.config.keys.ConnectorConfigKeys; | ||
import com.antgroup.geaflow.common.exception.GeaflowRuntimeException; | ||
import com.antgroup.geaflow.common.type.IType; | ||
import com.antgroup.geaflow.dsl.common.data.Row; | ||
import com.antgroup.geaflow.dsl.common.data.impl.ObjectRow; | ||
import com.antgroup.geaflow.dsl.common.types.StructType; | ||
import com.antgroup.geaflow.dsl.common.util.TypeCastUtil; | ||
import com.antgroup.geaflow.dsl.connector.api.serde.TableDeserializer; | ||
import com.fasterxml.jackson.core.JsonProcessingException; | ||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
|
||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Objects; | ||
|
||
public class JsonDeserializer implements TableDeserializer<String> { | ||
|
||
private StructType schema; | ||
|
||
private ObjectMapper mapper; | ||
|
||
private boolean ignoreParseError; | ||
|
||
private boolean failOnMissingField; | ||
|
||
|
||
@Override | ||
public void init(Configuration conf, StructType schema) { | ||
this.schema = Objects.requireNonNull(schema); | ||
this.mapper = new ObjectMapper(); | ||
this.ignoreParseError = conf.getBoolean(ConnectorConfigKeys.GEAFLOW_DSL_CONNECTOR_FORMAT_JSON_IGNORE_PARSE_ERROR); | ||
this.failOnMissingField = conf.getBoolean(ConnectorConfigKeys.GEAFLOW_DSL_CONNECTOR_FORMAT_JSON_FAIL_ON_MISSING_FIELD); | ||
|
||
} | ||
|
||
@Override | ||
public List<Row> deserialize(String record) { | ||
if (record == null || record.isEmpty()) { | ||
return Collections.emptyList(); | ||
} | ||
Object[] values = new Object[schema.size()]; | ||
JsonNode jsonNode = null; | ||
try { | ||
jsonNode = mapper.readTree(record); | ||
} catch (JsonProcessingException e) { | ||
// handle exception according to configuration | ||
if (ignoreParseError) { | ||
// return empty list | ||
return Collections.emptyList(); | ||
} else { | ||
throw new GeaflowRuntimeException("fail to deserialize record " + record , e); | ||
} | ||
} | ||
// if json node is null | ||
for (int i = 0 ; i < schema.size() ; i++) { | ||
String fieldName = schema.getFieldNames().get(i); | ||
if (failOnMissingField) { | ||
if (!jsonNode.has(fieldName)) { | ||
throw new GeaflowRuntimeException("fail to deserialize record " + record + " due to missing field " + fieldName ); | ||
} | ||
} | ||
JsonNode value = jsonNode.get(fieldName); | ||
IType<?> type = schema.getType(i); | ||
// cast the value to the type defined in the schema. | ||
if (value != null) { | ||
values[i] = TypeCastUtil.cast(value.asText(), type); | ||
} else { | ||
values[i] = null; | ||
} | ||
|
||
} | ||
return Collections.singletonList(ObjectRow.create(values)); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
101 changes: 101 additions & 0 deletions
101
...nector-api/src/test/java/com/antgroup/geaflow/dsl/connector/api/JsonDeserializerTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
package com.antgroup.geaflow.dsl.connector.api; | ||
|
||
import com.antgroup.geaflow.common.config.Configuration; | ||
import com.antgroup.geaflow.common.exception.GeaflowRuntimeException; | ||
import com.antgroup.geaflow.common.type.primitive.BinaryStringType; | ||
import com.antgroup.geaflow.common.type.primitive.IntegerType; | ||
import com.antgroup.geaflow.dsl.common.data.Row; | ||
import com.antgroup.geaflow.dsl.common.types.StructType; | ||
import com.antgroup.geaflow.dsl.common.types.TableField; | ||
import com.antgroup.geaflow.dsl.connector.api.serde.impl.JsonDeserializer; | ||
import org.junit.Test; | ||
import org.testng.Assert; | ||
|
||
import java.util.Collections; | ||
import java.util.List; | ||
|
||
public class JsonDeserializerTest { | ||
|
||
@Test | ||
public void testDeserialize() { | ||
JsonDeserializer deserializer = new JsonDeserializer(); | ||
StructType dataSchema = new StructType( | ||
new TableField("id", IntegerType.INSTANCE, false), | ||
new TableField("name", BinaryStringType.INSTANCE, true), | ||
new TableField("age", IntegerType.INSTANCE, false) | ||
); | ||
deserializer.init(new Configuration(), dataSchema); | ||
List<Row> row = deserializer.deserialize("{\"id\":1, \"name\":\"amy\", \"age\":10}"); | ||
List<Row> rowWithNull = deserializer.deserialize("{\"id\":1, \"name\":\"amy\"}"); | ||
Assert.assertEquals(row.get(0).getField(0, IntegerType.INSTANCE), 1); | ||
Assert.assertEquals(row.get(0).getField(1, BinaryStringType.INSTANCE).toString(), "amy"); | ||
Assert.assertEquals(row.get(0).getField(2, IntegerType.INSTANCE), 10); | ||
Assert.assertEquals(rowWithNull.get(0).getField(0, IntegerType.INSTANCE), 1); | ||
Assert.assertEquals(rowWithNull.get(0).getField(1, BinaryStringType.INSTANCE).toString(), "amy"); | ||
Assert.assertEquals(rowWithNull.get(0).getField(2, IntegerType.INSTANCE), null); | ||
|
||
} | ||
|
||
|
||
@Test | ||
public void testDeserializeEmptyString() { | ||
JsonDeserializer deserializer = new JsonDeserializer(); | ||
StructType dataSchema = new StructType( | ||
new TableField("id", IntegerType.INSTANCE, false), | ||
new TableField("name", BinaryStringType.INSTANCE, true), | ||
new TableField("age", IntegerType.INSTANCE, false) | ||
); | ||
deserializer.init(new Configuration(), dataSchema); | ||
List<Row> rows = deserializer.deserialize(""); | ||
List<Row> testNullRows = deserializer.deserialize(null); | ||
Assert.assertEquals(rows, Collections.emptyList()); | ||
Assert.assertEquals(testNullRows, Collections.emptyList()); | ||
|
||
} | ||
|
||
@Test(expected = GeaflowRuntimeException.class) | ||
public void testDeserializeParseError() { | ||
JsonDeserializer deserializer = new JsonDeserializer(); | ||
StructType dataSchema = new StructType( | ||
new TableField("id", IntegerType.INSTANCE, false), | ||
new TableField("name", BinaryStringType.INSTANCE, true), | ||
new TableField("age", IntegerType.INSTANCE, false) | ||
); | ||
deserializer.init(new Configuration(), dataSchema); | ||
List<Row> rows = deserializer.deserialize("test"); | ||
} | ||
|
||
@Test | ||
public void testDeserializeIgnoreParseError() { | ||
JsonDeserializer deserializer = new JsonDeserializer(); | ||
StructType dataSchema = new StructType( | ||
new TableField("id", IntegerType.INSTANCE, false), | ||
new TableField("name", BinaryStringType.INSTANCE, true), | ||
new TableField("age", IntegerType.INSTANCE, false) | ||
); | ||
Configuration conf = new Configuration(); | ||
conf.put("geaflow.dsl.connector.format.json.ignore-parse-error", "true"); | ||
deserializer.init(conf, dataSchema); | ||
List<Row> rows = deserializer.deserialize("test"); | ||
Assert.assertEquals(rows, Collections.emptyList()); | ||
} | ||
|
||
@Test(expected = GeaflowRuntimeException.class) | ||
public void testDeserializeFailOnMissingField() { | ||
JsonDeserializer deserializer = new JsonDeserializer(); | ||
StructType dataSchema = new StructType( | ||
new TableField("id", IntegerType.INSTANCE, false), | ||
new TableField("name", BinaryStringType.INSTANCE, true), | ||
new TableField("age", IntegerType.INSTANCE, false) | ||
); | ||
Configuration conf = new Configuration(); | ||
conf.put("geaflow.dsl.connector.format.json.fail-on-missing-field", "true"); | ||
deserializer.init(conf, dataSchema); | ||
List<Row> rowWithMissingField = deserializer.deserialize("{\"id\":1, \"name\":\"amy\"}"); | ||
|
||
} | ||
|
||
|
||
|
||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.