diff --git a/docs/source/user-guide/compatibility-template.md b/docs/source/user-guide/compatibility-template.md index deaca2d247..64f8713546 100644 --- a/docs/source/user-guide/compatibility-template.md +++ b/docs/source/user-guide/compatibility-template.md @@ -44,7 +44,19 @@ Cast operations in Comet fall into three levels of support: - **Unsupported**: Comet does not provide a native version of this cast expression and the query stage will fall back to Spark. -The following table shows the current cast operations supported by Comet. Any cast that does not appear in this -table (such as those involving complex types and timestamp_ntz, for example) are not supported by Comet. +### Compatible Casts - +The following cast operations are generally compatible with Spark except for the differences noted here. + + + +### Incompatible Casts + +The following cast operations are not compatible with Spark for all inputs and are disabled by default. + + + +### Unsupported Casts + +Any cast not listed in the previous tables is currently unsupported. We are working on adding more. See the +[tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. diff --git a/docs/source/user-guide/compatibility.md b/docs/source/user-guide/compatibility.md index 9a2478d376..57a4271f41 100644 --- a/docs/source/user-guide/compatibility.md +++ b/docs/source/user-guide/compatibility.md @@ -38,122 +38,89 @@ Cast operations in Comet fall into three levels of support: - **Compatible**: The results match Apache Spark - **Incompatible**: The results may match Apache Spark for some inputs, but there are known issues where some inputs - will result in incorrect results or exceptions. The query stage will fall back to Spark by default. Setting - `spark.comet.cast.allowIncompatible=true` will allow all incompatible casts to run natively in Comet, but this is not - recommended for production use. +will result in incorrect results or exceptions. The query stage will fall back to Spark by default. Setting +`spark.comet.cast.allowIncompatible=true` will allow all incompatible casts to run natively in Comet, but this is not +recommended for production use. - **Unsupported**: Comet does not provide a native version of this cast expression and the query stage will fall back to - Spark. - -The following table shows the current cast operations supported by Comet. Any cast that does not appear in this -table (such as those involving complex types and timestamp_ntz, for example) are not supported by Comet. - -| From Type | To Type | Compatible? | Notes | -| --------- | --------- | ------------ | ----------------------------------- | -| boolean | byte | Compatible | | -| boolean | short | Compatible | | -| boolean | integer | Compatible | | -| boolean | long | Compatible | | -| boolean | float | Compatible | | -| boolean | double | Compatible | | -| boolean | decimal | Unsupported | | -| boolean | string | Compatible | | -| boolean | timestamp | Unsupported | | -| byte | boolean | Compatible | | -| byte | short | Compatible | | -| byte | integer | Compatible | | -| byte | long | Compatible | | -| byte | float | Compatible | | -| byte | double | Compatible | | -| byte | decimal | Compatible | | -| byte | string | Compatible | | -| byte | binary | Unsupported | | -| byte | timestamp | Unsupported | | -| short | boolean | Compatible | | -| short | byte | Compatible | | -| short | integer | Compatible | | -| short | long | Compatible | | -| short | float | Compatible | | -| short | double | Compatible | | -| short | decimal | Compatible | | -| short | string | Compatible | | -| short | binary | Unsupported | | -| short | timestamp | Unsupported | | -| integer | boolean | Compatible | | -| integer | byte | Compatible | | -| integer | short | Compatible | | -| integer | long | Compatible | | -| integer | float | Compatible | | -| integer | double | Compatible | | -| integer | decimal | Compatible | | -| integer | string | Compatible | | -| integer | binary | Unsupported | | -| integer | timestamp | Unsupported | | -| long | boolean | Compatible | | -| long | byte | Compatible | | -| long | short | Compatible | | -| long | integer | Compatible | | -| long | float | Compatible | | -| long | double | Compatible | | -| long | decimal | Compatible | | -| long | string | Compatible | | -| long | binary | Unsupported | | -| long | timestamp | Unsupported | | -| float | boolean | Compatible | | -| float | byte | Unsupported | | -| float | short | Unsupported | | -| float | integer | Unsupported | | -| float | long | Unsupported | | -| float | double | Compatible | | -| float | decimal | Unsupported | | -| float | string | Incompatible | | -| float | timestamp | Unsupported | | -| double | boolean | Compatible | | -| double | byte | Unsupported | | -| double | short | Unsupported | | -| double | integer | Unsupported | | -| double | long | Unsupported | | -| double | float | Compatible | | -| double | decimal | Incompatible | | -| double | string | Incompatible | | -| double | timestamp | Unsupported | | -| decimal | boolean | Unsupported | | -| decimal | byte | Unsupported | | -| decimal | short | Unsupported | | -| decimal | integer | Unsupported | | -| decimal | long | Unsupported | | -| decimal | float | Compatible | | -| decimal | double | Compatible | | -| decimal | string | Unsupported | | -| decimal | timestamp | Unsupported | | -| string | boolean | Compatible | | -| string | byte | Compatible | | -| string | short | Compatible | | -| string | integer | Compatible | | -| string | long | Compatible | | -| string | float | Unsupported | | -| string | double | Unsupported | | -| string | decimal | Unsupported | | -| string | binary | Compatible | | -| string | date | Unsupported | | -| string | timestamp | Incompatible | Not all valid formats are supported | -| binary | string | Incompatible | | -| date | boolean | Unsupported | | -| date | byte | Unsupported | | -| date | short | Unsupported | | -| date | integer | Unsupported | | -| date | long | Unsupported | | -| date | float | Unsupported | | -| date | double | Unsupported | | -| date | decimal | Unsupported | | -| date | string | Compatible | | -| date | timestamp | Unsupported | | -| timestamp | boolean | Unsupported | | -| timestamp | byte | Unsupported | | -| timestamp | short | Unsupported | | -| timestamp | integer | Unsupported | | -| timestamp | long | Compatible | | -| timestamp | float | Unsupported | | -| timestamp | double | Unsupported | | -| timestamp | decimal | Unsupported | | -| timestamp | string | Compatible | | -| timestamp | date | Compatible | | +Spark. + +### Compatible Casts + +The following cast operations are generally compatible with Spark except for the differences noted here. + +| From Type | To Type | Notes | +|-|-|-| +| boolean | byte | | +| boolean | short | | +| boolean | integer | | +| boolean | long | | +| boolean | float | | +| boolean | double | | +| boolean | string | | +| byte | boolean | | +| byte | short | | +| byte | integer | | +| byte | long | | +| byte | float | | +| byte | double | | +| byte | decimal | | +| byte | string | | +| short | boolean | | +| short | byte | | +| short | integer | | +| short | long | | +| short | float | | +| short | double | | +| short | decimal | | +| short | string | | +| integer | boolean | | +| integer | byte | | +| integer | short | | +| integer | long | | +| integer | float | | +| integer | double | | +| integer | string | | +| long | boolean | | +| long | byte | | +| long | short | | +| long | integer | | +| long | float | | +| long | double | | +| long | string | | +| float | boolean | | +| float | double | | +| float | string | There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 | +| double | boolean | | +| double | float | | +| double | string | There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 | +| decimal | float | | +| decimal | double | | +| string | boolean | | +| string | byte | | +| string | short | | +| string | integer | | +| string | long | | +| string | binary | | +| date | string | | +| timestamp | long | | +| timestamp | decimal | | +| timestamp | string | | +| timestamp | date | | + +### Incompatible Casts + +The following cast operations are not compatible with Spark for all inputs and are disabled by default. + +| From Type | To Type | Notes | +|-|-|-| +| integer | decimal | No overflow check | +| long | decimal | No overflow check | +| float | decimal | No overflow check | +| double | decimal | No overflow check | +| string | timestamp | Not all valid formats are supported | +| binary | string | Only works for binary data representing valid UTF-8 strings | + +### Unsupported Casts + +Any cast not listed in the previous tables is currently unsupported. We are working on adding more. See the +[tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index 8c414c7fed..1e28efd526 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -25,7 +25,7 @@ import scala.io.Source import org.apache.spark.sql.catalyst.expressions.Cast -import org.apache.comet.expressions.{CometCast, Compatible, Incompatible, Unsupported} +import org.apache.comet.expressions.{CometCast, Compatible, Incompatible} /** * Utility for generating markdown documentation from the configs. @@ -64,23 +64,36 @@ object GenerateDocs { val outputFilename = "docs/source/user-guide/compatibility.md" val w = new BufferedOutputStream(new FileOutputStream(outputFilename)) for (line <- Source.fromFile(templateFilename).getLines()) { - if (line.trim == "") { - w.write("| From Type | To Type | Compatible? | Notes |\n".getBytes) - w.write("|-|-|-|-|\n".getBytes) + if (line.trim == "") { + w.write("| From Type | To Type | Notes |\n".getBytes) + w.write("|-|-|-|\n".getBytes) for (fromType <- CometCast.supportedTypes) { for (toType <- CometCast.supportedTypes) { if (Cast.canCast(fromType, toType) && fromType != toType) { val fromTypeName = fromType.typeName.replace("(10,2)", "") val toTypeName = toType.typeName.replace("(10,2)", "") CometCast.isSupported(fromType, toType, None, "LEGACY") match { - case Compatible => - w.write(s"| $fromTypeName | $toTypeName | Compatible | |\n".getBytes) - case Incompatible(Some(reason)) => - w.write(s"| $fromTypeName | $toTypeName | Incompatible | $reason |\n".getBytes) - case Incompatible(None) => - w.write(s"| $fromTypeName | $toTypeName | Incompatible | |\n".getBytes) - case Unsupported => - w.write(s"| $fromTypeName | $toTypeName | Unsupported | |\n".getBytes) + case Compatible(notes) => + val notesStr = notes.getOrElse("").trim + w.write(s"| $fromTypeName | $toTypeName | $notesStr |\n".getBytes) + case _ => + } + } + } + } + } else if (line.trim == "") { + w.write("| From Type | To Type | Notes |\n".getBytes) + w.write("|-|-|-|\n".getBytes) + for (fromType <- CometCast.supportedTypes) { + for (toType <- CometCast.supportedTypes) { + if (Cast.canCast(fromType, toType) && fromType != toType) { + val fromTypeName = fromType.typeName.replace("(10,2)", "") + val toTypeName = toType.typeName.replace("(10,2)", "") + CometCast.isSupported(fromType, toType, None, "LEGACY") match { + case Incompatible(notes) => + val notesStr = notes.getOrElse("").trim + w.write(s"| $fromTypeName | $toTypeName | $notesStr |\n".getBytes) + case _ => } } } diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 5641c94a87..57e07b8cda 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -24,10 +24,10 @@ import org.apache.spark.sql.types.{DataType, DataTypes, DecimalType} sealed trait SupportLevel /** We support this feature with full compatibility with Spark */ -object Compatible extends SupportLevel +case class Compatible(notes: Option[String] = None) extends SupportLevel /** We support this feature but results can be different from Spark */ -case class Incompatible(reason: Option[String] = None) extends SupportLevel +case class Incompatible(notes: Option[String] = None) extends SupportLevel /** We do not support this feature */ object Unsupported extends SupportLevel @@ -58,7 +58,7 @@ object CometCast { evalMode: String): SupportLevel = { if (fromType == toType) { - return Compatible + return Compatible() } (fromType, toType) match { @@ -83,10 +83,14 @@ object CometCast { canCastFromDecimal(toType) case (DataTypes.BooleanType, _) => canCastFromBoolean(toType) - case ( - DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType, - _) => + case (DataTypes.ByteType, _) => + canCastFromByte(toType) + case (DataTypes.ShortType, _) => + canCastFromShort(toType) + case (DataTypes.IntegerType, _) => canCastFromInt(toType) + case (DataTypes.LongType, _) => + canCastFromLong(toType) case (DataTypes.FloatType, _) => canCastFromFloat(toType) case (DataTypes.DoubleType, _) => @@ -101,12 +105,12 @@ object CometCast { evalMode: String): SupportLevel = { toType match { case DataTypes.BooleanType => - Compatible + Compatible() case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType => - Compatible + Compatible() case DataTypes.BinaryType => - Compatible + Compatible() case DataTypes.FloatType | DataTypes.DoubleType => // https://github.com/apache/datafusion-comet/issues/326 Unsupported @@ -130,18 +134,21 @@ object CometCast { private def canCastToString(fromType: DataType): SupportLevel = { fromType match { - case DataTypes.BooleanType => Compatible + case DataTypes.BooleanType => Compatible() case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType => - Compatible - case DataTypes.DateType => Compatible - case DataTypes.TimestampType => Compatible + Compatible() + case DataTypes.DateType => Compatible() + case DataTypes.TimestampType => Compatible() case DataTypes.FloatType | DataTypes.DoubleType => - // https://github.com/apache/datafusion-comet/issues/326 - Incompatible() + Compatible( + Some( + "There can be differences in precision. " + + "For example, the input \"1.4E-45\" will produce 1.0E-45 " + + "instead of 1.4E-45")) case DataTypes.BinaryType => // https://github.com/apache/datafusion-comet/issues/377 - Incompatible() + Incompatible(Some("Only works for binary data representing valid UTF-8 strings")) case _ => Unsupported } } @@ -155,9 +162,10 @@ object CometCast { Unsupported case DataTypes.LongType => // https://github.com/apache/datafusion-comet/issues/352 - Compatible - case DataTypes.StringType => Compatible - case DataTypes.DateType => Compatible + Compatible() + case DataTypes.StringType => Compatible() + case DataTypes.DateType => Compatible() + case _: DecimalType => Compatible() case _ => Unsupported } } @@ -165,31 +173,72 @@ object CometCast { private def canCastFromBoolean(toType: DataType): SupportLevel = toType match { case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType | DataTypes.FloatType | DataTypes.DoubleType => - Compatible + Compatible() case _ => Unsupported } + private def canCastFromByte(toType: DataType): SupportLevel = toType match { + case DataTypes.BooleanType => + Compatible() + case DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType => + Compatible() + case DataTypes.FloatType | DataTypes.DoubleType | _: DecimalType => + Compatible() + case _ => + Unsupported + } + + private def canCastFromShort(toType: DataType): SupportLevel = toType match { + case DataTypes.BooleanType => + Compatible() + case DataTypes.ByteType | DataTypes.IntegerType | DataTypes.LongType => + Compatible() + case DataTypes.FloatType | DataTypes.DoubleType | _: DecimalType => + Compatible() + case _ => + Unsupported + } + private def canCastFromInt(toType: DataType): SupportLevel = toType match { - case DataTypes.BooleanType | DataTypes.ByteType | DataTypes.ShortType | - DataTypes.IntegerType | DataTypes.LongType | DataTypes.FloatType | DataTypes.DoubleType | - _: DecimalType => - Compatible - case _ => Unsupported + case DataTypes.BooleanType => + Compatible() + case DataTypes.ByteType | DataTypes.ShortType | DataTypes.LongType => + Compatible() + case DataTypes.FloatType | DataTypes.DoubleType => + Compatible() + case _: DecimalType => + Incompatible(Some("No overflow check")) + case _ => + Unsupported + } + + private def canCastFromLong(toType: DataType): SupportLevel = toType match { + case DataTypes.BooleanType => + Compatible() + case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType => + Compatible() + case DataTypes.FloatType | DataTypes.DoubleType => + Compatible() + case _: DecimalType => + Incompatible(Some("No overflow check")) + case _ => + Unsupported } private def canCastFromFloat(toType: DataType): SupportLevel = toType match { - case DataTypes.BooleanType | DataTypes.DoubleType => Compatible + case DataTypes.BooleanType | DataTypes.DoubleType => Compatible() + case _: DecimalType => Incompatible(Some("No overflow check")) case _ => Unsupported } private def canCastFromDouble(toType: DataType): SupportLevel = toType match { - case DataTypes.BooleanType | DataTypes.FloatType => Compatible - case _: DecimalType => Incompatible() + case DataTypes.BooleanType | DataTypes.FloatType => Compatible() + case _: DecimalType => Incompatible(Some("No overflow check")) case _ => Unsupported } private def canCastFromDecimal(toType: DataType): SupportLevel = toType match { - case DataTypes.FloatType | DataTypes.DoubleType => Compatible + case DataTypes.FloatType | DataTypes.DoubleType => Compatible() case _ => Unsupported } diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 1e8877c8d0..86e9f10b90 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -636,7 +636,7 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde { reason.map(str => s" ($str)").getOrElse("") castSupport match { - case Compatible => + case Compatible(_) => castToProto(timeZoneId, dt, childExpr, evalModeStr) case Incompatible(reason) => if (CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.get()) { diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index 483301e02e..1d698a49a8 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, DataTypes} -import org.apache.comet.expressions.CometCast +import org.apache.comet.expressions.{CometCast, Compatible} class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { import testImplicits._ @@ -66,6 +66,23 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } else if (!testExists) { fail(s"Missing test: $expectedTestName") + } else { + val testIgnored = + tags.get(expectedTestName).exists(s => s.contains("org.scalatest.Ignore")) + CometCast.isSupported(fromType, toType, None, "LEGACY") match { + case Compatible(_) => + if (testIgnored) { + fail( + s"Cast from $fromType to $toType is reported as compatible " + + "with Spark but the test is ignored") + } + case _ => + if (!testIgnored) { + fail( + s"We claim that cast from $fromType to $toType is not compatible " + + "with Spark but the test is not ignored") + } + } } } else if (testExists) { fail(s"Found test for cast that Spark does not support: $expectedTestName") @@ -347,7 +364,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { Short.MaxValue.toFloat, 0.0f) ++ Range(0, dataSize).map(_ => r.nextFloat()) - withNulls(values).toDF("a") + castTest(withNulls(values).toDF("a"), DataTypes.StringType) } ignore("cast FloatType to TimestampType") { @@ -401,7 +418,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { Double.NegativeInfinity, 0.0d) ++ Range(0, dataSize).map(_ => r.nextDouble()) - withNulls(values).toDF("a") + castTest(withNulls(values).toDF("a"), DataTypes.StringType) } ignore("cast DoubleType to TimestampType") { @@ -559,6 +576,14 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } + ignore("cast StringType to TimestampType") { + // https://github.com/apache/datafusion-comet/issues/328 + withSQLConf((CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key, "true")) { + val values = Seq("2020-01-01T12:34:56.123456", "T2") ++ generateStrings(timestampPattern, 8) + castTest(values.toDF("a"), DataTypes.TimestampType) + } + } + test("cast StringType to TimestampType disabled for non-UTC timezone") { withSQLConf((SQLConf.SESSION_LOCAL_TIMEZONE.key, "America/Denver")) { val values = Seq("2020-01-01T12:34:56.123456", "T2").toDF("a") @@ -569,15 +594,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } - ignore("cast StringType to TimestampType (fuzz test)") { - // https://github.com/apache/datafusion-comet/issues/328 - withSQLConf((CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key, "true")) { - val values = Seq("2020-01-01T12:34:56.123456", "T2") ++ generateStrings(timestampPattern, 8) - castTest(values.toDF("a"), DataTypes.TimestampType) - } - } - - test("cast StringType to TimestampType") { + test("cast StringType to TimestampType - subset of supported values") { withSQLConf( SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC", CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key -> "true") { @@ -606,8 +623,12 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { // CAST from BinaryType ignore("cast BinaryType to StringType") { - // TODO implement this // https://github.com/apache/datafusion-comet/issues/377 + castTest(generateBinary(), DataTypes.StringType) + } + + test("cast BinaryType to StringType - valid UTF-8 inputs") { + castTest(generateStrings(numericPattern, 8).toDF("a"), DataTypes.StringType) } // CAST from DateType @@ -795,7 +816,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { Seq( "2024-01-01T12:34:56.123456", "2024-01-01T01:00:00Z", - "2024-12-31T01:00:00-02:00", + "9999-12-31T01:00:00-02:00", "2024-12-31T01:00:00+02:00") withNulls(values) .toDF("str") @@ -814,6 +835,16 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { Range(0, dataSize).map(_ => generateString(r, chars, maxLen)) } + private def generateBinary(): DataFrame = { + val r = new Random(0) + val bytes = new Array[Byte](8) + val values: Seq[Array[Byte]] = Range(0, dataSize).map(_ => { + r.nextBytes(bytes) + bytes.clone() + }) + values.toDF("a") + } + private def withNulls[T](values: Seq[T]): Seq[Option[T]] = { values.map(v => Some(v)) ++ Seq(None) } diff --git a/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala index 43c145ed2d..0c943b64be 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala @@ -253,7 +253,8 @@ class CometExecSuite extends CometTestBase { dataTypes.map { subqueryType => withSQLConf( CometConf.COMET_EXEC_SHUFFLE_ENABLED.key -> "true", - CometConf.COMET_COLUMNAR_SHUFFLE_ENABLED.key -> "true") { + CometConf.COMET_COLUMNAR_SHUFFLE_ENABLED.key -> "true", + CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key -> "true") { withParquetTable((0 until 5).map(i => (i, i + 1)), "tbl") { var column1 = s"CAST(max(_1) AS $subqueryType)" if (subqueryType == "BINARY") {