Commit f052013

Move TODO comments to README and address review comments

andygrove committed May 29, 2024
1 parent a19fb4d commit f052013
Showing 5 changed files with 35 additions and 24 deletions.
17 changes: 17 additions & 0 deletions fuzz-testing/README.md
@@ -26,6 +26,23 @@ Although it is a simple tool it has already been useful in finding many bugs.

Comet Fuzz is inspired by the [SparkFuzz](https://ir.cwi.nl/pub/30222) paper from Databricks and CWI.

## Roadmap

Planned areas of improvement (an illustrative sketch of the expression items follows this list):

- Support for all data types, expressions, and operators supported by Comet
- Explicit casts
- Unary and binary arithmetic expressions
- IF and CASE WHEN expressions
- Complex (nested) expressions
- Literal scalar values in queries
- Add option to avoid grouping and sorting on floating-point columns
- Improve join query support:
- Support joins without join keys
- Support composite join keys
- Support multiple join keys
- Support join conditions that use expressions
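
One possible shape for the expression items above (illustrative only; none of the names below are existing Comet Fuzz code):

```scala
import scala.util.Random

// Illustrative sketch: recursively build casts, arithmetic, and CASE WHEN
// expressions over a table's column names, bottoming out at column references.
object ExprGenSketch {
  private val binaryOps = Seq("+", "-", "*", "/")
  private val castTypes = Seq("int", "bigint", "double", "string")

  def randomExpr(r: Random, columns: Seq[String], depth: Int): String =
    if (depth == 0) {
      columns(r.nextInt(columns.length))
    } else {
      r.nextInt(3) match {
        case 0 => // binary arithmetic
          val op = binaryOps(r.nextInt(binaryOps.length))
          s"(${randomExpr(r, columns, depth - 1)} $op ${randomExpr(r, columns, depth - 1)})"
        case 1 => // explicit cast
          s"CAST(${randomExpr(r, columns, depth - 1)} AS ${castTypes(r.nextInt(castTypes.length))})"
        case _ => // CASE WHEN
          s"CASE WHEN ${randomExpr(r, columns, depth - 1)} > 0 " +
            s"THEN ${randomExpr(r, columns, depth - 1)} " +
            s"ELSE ${randomExpr(r, columns, depth - 1)} END"
      }
    }

  def main(args: Array[String]): Unit = {
    val r = new Random(42)
    println(randomExpr(r, Seq("c0", "c1", "c2"), depth = 2))
  }
}
```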

## Usage

Build the jar file first.
1 change: 0 additions & 1 deletion fuzz-testing/pom.xml
@@ -54,7 +54,6 @@ under the License.
<dependency>
<groupId>org.rogach</groupId>
<artifactId>scallop_${scala.binary.version}</artifactId>
<version>5.1.0</version>
</dependency>
</dependencies>

22 changes: 12 additions & 10 deletions fuzz-testing/src/main/scala/org/apache/comet/fuzz/DataGen.scala
@@ -19,6 +19,7 @@

package org.apache.comet.fuzz

import java.nio.charset.Charset
import java.sql.Timestamp

import scala.util.Random
@@ -46,22 +46,21 @@ object DataGen {
numRows: Int,
numColumns: Int): Unit = {

// TODO add examples of all supported types, including complex types
val dataTypes = Seq(
(DataTypes.BooleanType, 0.2),
(DataTypes.ByteType, 0.2),
(DataTypes.ShortType, 0.2),
(DataTypes.IntegerType, 0.2),
(DataTypes.LongType, 0.2),
(DataTypes.FloatType, 0.2),
(DataTypes.DoubleType, 0.2),
// TODO add support for all Comet supported types
// (DataTypes.createDecimalType(10,2), 0.2),
// (DataTypes.createDecimalType(10,0), 0.2),
// (DataTypes.createDecimalType(4,0), 0.2),
// TODO add Decimal support
// (DataTypes.createDecimalType(10,2), 0.2),
(DataTypes.DateType, 0.2),
(DataTypes.TimestampType, 0.2),
// (DataTypes.TimestampNTZType, 0.2),
(DataTypes.StringType, 0.2))
(DataTypes.TimestampNTZType, 0.2),
(DataTypes.StringType, 0.2),
(DataTypes.BinaryType, 0.2))

// generate schema using random data types
val fields = Range(0, numColumns)
@@ -76,14 +76,14 @@
Row.fromSeq(cols.map(_(rowIndex)))
})

// TODO random partitioning and bucketing
// TODO random parquet write options
val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
df.write.mode(SaveMode.Overwrite).parquet(filename)
}

def generateColumn(r: Random, dataType: DataType, numRows: Int): Seq[Any] = {
dataType match {
case DataTypes.BooleanType =>
generateColumn(r, DataTypes.LongType, numRows).map(_.asInstanceOf[Long].toShort).map(s => s % 2 == 0)
case DataTypes.ByteType =>
generateColumn(r, DataTypes.LongType, numRows).map(_.asInstanceOf[Long].toByte)
case DataTypes.ShortType =>
@@ -142,9 +142,11 @@ object DataGen {
case _ => r.nextString(8)
}
})
case DataTypes.BinaryType =>
generateColumn(r, DataTypes.StringType, numRows).map(_.asInstanceOf[String].getBytes(Charset.defaultCharset()))
case DataTypes.DateType =>
Range(0, numRows).map(_ => new java.sql.Date(1716645600011L + r.nextInt()))
case DataTypes.TimestampType =>
case DataTypes.TimestampType | DataTypes.TimestampNTZType =>
Range(0, numRows).map(_ => new Timestamp(1716645600011L + r.nextInt()))
case _ => throw new IllegalStateException(s"Cannot generate data for $dataType yet")
}
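The `dataTypes` sequence above pairs each type with a selection weight. The `Utils.randomWeightedChoice` helper that consumes pairs like these is not shown in this diff; a minimal sketch of how such a weighted pick could work, assuming a caller-supplied `Random` (the real helper in Utils.scala may differ, for example in how it sources randomness):

```scala
import scala.util.Random

object WeightedChoiceSketch {
  // One possible weighted pick over (value, weight) pairs, the shape of the
  // dataTypes list above. Not the actual Utils.randomWeightedChoice.
  def randomWeightedChoice[T](valuesWithWeights: Seq[(T, Double)], r: Random): T = {
    val total = valuesWithWeights.map(_._2).sum
    val target = r.nextDouble() * total
    var cumulative = 0.0
    valuesWithWeights
      .find { case (_, weight) =>
        cumulative += weight
        cumulative >= target
      }
      .map(_._1)
      .getOrElse(valuesWithWeights.last._1) // guard against rounding drift
  }

  def main(args: Array[String]): Unit = {
    val r = new Random(42)
    println(Seq.fill(10)(randomWeightedChoice(Seq(("often", 0.8), ("rarely", 0.2)), r)))
  }
}
```

With every weight set to 0.2, as in the diff, each type is equally likely; skewing a weight would bias the generated schemas toward that type.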
13 changes: 0 additions & 13 deletions fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryGen.scala
@@ -46,10 +46,6 @@ object QueryGen {
case 0 => generateJoin(r, spark, numFiles)
case 1 => generateAggregate(r, spark, numFiles)
case 2 => generateScalar(r, spark, numFiles)
// TODO add explicit casts
// TODO add unary and binary arithmetic expressions
// TODO add IF and CASE WHEN expressions
// TODO support nested expressions
}
if (!uniqueQueries.contains(sql)) {
uniqueQueries += sql
@@ -131,16 +127,13 @@

val func = Utils.randomChoice(aggFunc, r)
val args = Range(0, func.num_args)
// TODO support using literals as well as columns
.map(_ => Utils.randomChoice(table.columns, r))

// TODO avoid grouping and sorting on floating-point columns
val groupingCols = Range(0, 2).map(_ => Utils.randomChoice(table.columns, r))

if (groupingCols.isEmpty) {
s"SELECT ${args.mkString(", ")}, ${func.name}(${args.mkString(", ")}) AS x " +
s"FROM $tableName " +
// TODO avoid sorting on floating-point columns
s"ORDER BY ${args.mkString(", ")}"
} else {
s"SELECT ${groupingCols.mkString(", ")}, ${func.name}(${args.mkString(", ")}) " +
@@ -156,12 +149,10 @@

val func = Utils.randomChoice(scalarFunc, r)
val args = Range(0, func.num_args)
// TODO support using literals as well as columns
.map(_ => Utils.randomChoice(table.columns, r))

s"SELECT ${args.mkString(", ")}, ${func.name}(${args.mkString(", ")}) AS x " +
s"FROM $tableName " +
// TODO avoid sorting on floating-point columns
s"ORDER BY ${args.mkString(", ")}"
}

@@ -171,16 +162,12 @@
val leftTable = spark.table(leftTableName)
val rightTable = spark.table(rightTableName)

// TODO support no join keys
// TODO support multiple join keys
// TODO support join conditions that use expressions
val leftCol = Utils.randomChoice(leftTable.columns, r)
val rightCol = Utils.randomChoice(rightTable.columns, r)

val joinTypes = Seq(("INNER", 0.4), ("LEFT", 0.3), ("RIGHT", 0.3))
val joinType = Utils.randomWeightedChoice(joinTypes)

// TODO avoid sorting on floating-point columns
val leftColProjection = leftTable.columns.map(c => s"l.$c").mkString(", ")
val rightColProjection = rightTable.columns.map(c => s"r.$c").mkString(", ")
"SELECT " +
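`generateAggregate`, `generateScalar`, and `generateJoin` all draw columns through `Utils.randomChoice`, which is also outside this diff. A plausible minimal form, with a comment sketching the rough shape of SQL that `generateJoin` emits (illustrative only, since the tail of that method is truncated above):

```scala
import scala.util.Random

object RandomChoiceSketch {
  // Plausible stand-in for Utils.randomChoice; the real implementation
  // is not part of this diff and may differ.
  def randomChoice[T](values: Seq[T], r: Random): T =
    values(r.nextInt(values.length))

  def main(args: Array[String]): Unit = {
    val r = new Random(42)
    // With hypothetical tables t0/t1 and picked columns c0/c1, generateJoin
    // above would emit SQL of roughly this shape:
    //   SELECT l.c0, ..., r.c1, ...
    //   FROM t0 l INNER JOIN t1 r ON l.c0 = r.c1
    //   ORDER BY ...
    println(randomChoice(Seq("INNER", "LEFT", "RIGHT"), r))
  }
}
```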
6 changes: 6 additions & 0 deletions pom.xml
@@ -410,6 +410,12 @@ under the License.
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.rogach</groupId>
<artifactId>scallop_${scala.binary.version}</artifactId>
<version>5.1.0</version>
</dependency>

</dependencies>

</dependencyManagement>
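Taken together with the `fuzz-testing/pom.xml` change above, this moves the scallop version into the root pom's `<dependencyManagement>` section, which pins 5.1.0 in one place and lets module poms declare the dependency without repeating a version.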
