From f052013cde390c4a33dff11fcd9881ad481d0f10 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Wed, 29 May 2024 07:55:55 -0600
Subject: [PATCH] Move TODO comments to README and address review comments

---
 fuzz-testing/README.md                        | 17 ++++++++++++++
 fuzz-testing/pom.xml                          |  1 -
 .../scala/org/apache/comet/fuzz/DataGen.scala | 22 ++++++++++---------
 .../org/apache/comet/fuzz/QueryGen.scala      | 13 -----------
 pom.xml                                       |  6 +++++
 5 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/fuzz-testing/README.md b/fuzz-testing/README.md
index 5ff127dfa..d85859019 100644
--- a/fuzz-testing/README.md
+++ b/fuzz-testing/README.md
@@ -26,6 +26,23 @@ Although it is a simple tool it has already been useful in finding many bugs.
 
 Comet Fuzz is inspired by the [SparkFuzz](https://ir.cwi.nl/pub/30222) paper from Databricks and CWI.
 
+## Roadmap
+
+Planned areas of improvement:
+
+- Support for all data types, expressions, and operators supported by Comet
+- Explicit casts
+- Unary and binary arithmetic expressions
+- IF and CASE WHEN expressions
+- Complex (nested) expressions
+- Literal scalar values in queries
+- Add option to avoid grouping and sorting on floating-point columns
+- Improve join query support:
+  - Support joins without join keys
+  - Support composite join keys
+  - Support multiple join keys
+  - Support join conditions that use expressions
+
 ## Usage
 
 Build the jar file first.
diff --git a/fuzz-testing/pom.xml b/fuzz-testing/pom.xml
index 66cf9dda5..f69d959f9 100644
--- a/fuzz-testing/pom.xml
+++ b/fuzz-testing/pom.xml
@@ -54,7 +54,6 @@ under the License.
     <dependency>
       <groupId>org.rogach</groupId>
       <artifactId>scallop_${scala.binary.version}</artifactId>
-      <version>5.1.0</version>
     </dependency>
   </dependencies>
 
diff --git a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/DataGen.scala b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/DataGen.scala
index b11d50037..71df059c4 100644
--- a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/DataGen.scala
+++ b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/DataGen.scala
@@ -19,6 +19,7 @@
 
 package org.apache.comet.fuzz
 
+import java.nio.charset.Charset
 import java.sql.Timestamp
 
 import scala.util.Random
@@ -46,22 +47,21 @@ object DataGen {
       numRows: Int,
       numColumns: Int): Unit = {
 
-    // TODO add examples of all supported types, including complex types
     val dataTypes = Seq(
+      (DataTypes.BooleanType, 0.2),
       (DataTypes.ByteType, 0.2),
       (DataTypes.ShortType, 0.2),
       (DataTypes.IntegerType, 0.2),
       (DataTypes.LongType, 0.2),
       (DataTypes.FloatType, 0.2),
       (DataTypes.DoubleType, 0.2),
-      // TODO add support for all Comet supported types
-//      (DataTypes.createDecimalType(10,2), 0.2),
-//      (DataTypes.createDecimalType(10,0), 0.2),
-//      (DataTypes.createDecimalType(4,0), 0.2),
+      // TODO add Decimal support
+      // (DataTypes.createDecimalType(10,2), 0.2),
       (DataTypes.DateType, 0.2),
       (DataTypes.TimestampType, 0.2),
-//      (DataTypes.TimestampNTZType, 0.2),
-      (DataTypes.StringType, 0.2))
+      (DataTypes.TimestampNTZType, 0.2),
+      (DataTypes.StringType, 0.2),
+      (DataTypes.BinaryType, 0.2))
 
     // generate schema using random data types
     val fields = Range(0, numColumns)
@@ -76,14 +76,14 @@ object DataGen {
         Row.fromSeq(cols.map(_(rowIndex)))
       })
 
-    // TODO random partitioning and bucketing
-    // TODO random parquet write options
     val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
     df.write.mode(SaveMode.Overwrite).parquet(filename)
   }
 
   def generateColumn(r: Random, dataType: DataType, numRows: Int): Seq[Any] = {
     dataType match {
+      case DataTypes.BooleanType =>
+        generateColumn(r, DataTypes.LongType, numRows).map(_.asInstanceOf[Long].toShort).map(s => s % 2 == 0)
       case DataTypes.ByteType =>
         generateColumn(r, DataTypes.LongType, numRows).map(_.asInstanceOf[Long].toByte)
       case DataTypes.ShortType =>
@@ -142,9 +142,11 @@ object DataGen {
             case _ => r.nextString(8)
           }
         })
+      case DataTypes.BinaryType =>
+        generateColumn(r, DataTypes.StringType, numRows).map(_.asInstanceOf[String].getBytes(Charset.defaultCharset()))
       case DataTypes.DateType =>
         Range(0, numRows).map(_ => new java.sql.Date(1716645600011L + r.nextInt()))
-      case DataTypes.TimestampType =>
+      case DataTypes.TimestampType | DataTypes.TimestampNTZType =>
         Range(0, numRows).map(_ => new Timestamp(1716645600011L + r.nextInt()))
       case _ => throw new IllegalStateException(s"Cannot generate data for $dataType yet")
     }
diff --git a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryGen.scala b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryGen.scala
index b67e821a5..bd36b6dd7 100644
--- a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryGen.scala
+++ b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryGen.scala
@@ -46,10 +46,6 @@ object QueryGen {
       case 0 => generateJoin(r, spark, numFiles)
       case 1 => generateAggregate(r, spark, numFiles)
       case 2 => generateScalar(r, spark, numFiles)
-      // TODO add explicit casts
-      // TODO add unary and binary arithmetic expressions
-      // TODO add IF and CASE WHEN expressions
-      // TODO support nested expressions
     }
     if (!uniqueQueries.contains(sql)) {
       uniqueQueries += sql
@@ -131,16 +127,13 @@ object QueryGen {
 
     val func = Utils.randomChoice(aggFunc, r)
     val args = Range(0, func.num_args)
-      // TODO support using literals as well as columns
       .map(_ => Utils.randomChoice(table.columns, r))
 
-    // TODO avoid grouping and sorting on floating-point columns
     val groupingCols = Range(0, 2).map(_ => Utils.randomChoice(table.columns, r))
 
     if (groupingCols.isEmpty) {
       s"SELECT ${args.mkString(", ")}, ${func.name}(${args.mkString(", ")}) AS x " +
         s"FROM $tableName " +
-        // TODO avoid sorting on floating-point columns
         s"ORDER BY ${args.mkString(", ")}"
     } else {
       s"SELECT ${groupingCols.mkString(", ")}, ${func.name}(${args.mkString(", ")}) " +
@@ -156,12 +149,10 @@ object QueryGen {
 
     val func = Utils.randomChoice(scalarFunc, r)
     val args = Range(0, func.num_args)
-      // TODO support using literals as well as columns
       .map(_ => Utils.randomChoice(table.columns, r))
 
     s"SELECT ${args.mkString(", ")}, ${func.name}(${args.mkString(", ")}) AS x " +
       s"FROM $tableName " +
-      // TODO avoid sorting on floating-point columns
       s"ORDER BY ${args.mkString(", ")}"
   }
 
@@ -171,16 +162,12 @@ object QueryGen {
     val leftTable = spark.table(leftTableName)
     val rightTable = spark.table(rightTableName)
 
-    // TODO support no join keys
-    // TODO support multiple join keys
-    // TODO support join conditions that use expressions
    val leftCol = Utils.randomChoice(leftTable.columns, r)
     val rightCol = Utils.randomChoice(rightTable.columns, r)
 
     val joinTypes = Seq(("INNER", 0.4), ("LEFT", 0.3), ("RIGHT", 0.3))
     val joinType = Utils.randomWeightedChoice(joinTypes)
 
-    // TODO avoid sorting on floating-point columns
     val leftColProjection = leftTable.columns.map(c => s"l.$c").mkString(", ")
     val rightColProjection = rightTable.columns.map(c => s"r.$c").mkString(", ")
     "SELECT " +
diff --git a/pom.xml b/pom.xml
index 94bd6f821..5813143b3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -410,6 +410,12 @@ under the License.
         <type>test-jar</type>
         <scope>test</scope>
       </dependency>
+
+      <dependency>
+        <groupId>org.rogach</groupId>
+        <artifactId>scallop_${scala.binary.version}</artifactId>
+        <version>5.1.0</version>
+      </dependency>
     </dependencies>
   </dependencyManagement>
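
Note: the pattern worth seeing in the `DataGen` changes above is that the new type cases reuse existing generators rather than introducing new randomness: Booleans are derived from the Long generator by parity, and binary values are derived from random Strings via their byte encoding. Below is a minimal standalone sketch of that derivation pattern; the object and method names are illustrative only and are not part of the patch.

```scala
import java.nio.charset.Charset
import scala.util.Random

object DerivedGenerators {
  // Base generator: random Longs, mirroring DataGen's LongType case.
  def longs(r: Random, numRows: Int): Seq[Long] =
    Seq.fill(numRows)(r.nextLong())

  // Booleans derived from Longs by parity, as in the patch's BooleanType case.
  def booleans(r: Random, numRows: Int): Seq[Boolean] =
    longs(r, numRows).map(_ % 2 == 0)

  // Binary values derived from random Strings, as in the patch's BinaryType case.
  def binary(r: Random, numRows: Int): Seq[Array[Byte]] =
    Seq.fill(numRows)(r.nextString(8)).map(_.getBytes(Charset.defaultCharset()))

  def main(args: Array[String]): Unit = {
    val r = new Random(42) // fixed seed so runs are reproducible
    println(booleans(r, 5))
    // nextString produces arbitrary Unicode chars, so encoded byte
    // lengths can exceed 8 depending on the default charset.
    println(binary(r, 2).map(_.length))
  }
}
```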