From 6d1668bdd05503cdf63b816988fababcd11cd057 Mon Sep 17 00:00:00 2001 From: zeotuan <48720253+zeotuan@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:34:30 +1100 Subject: [PATCH 01/10] Revert "Fix String test" This reverts commit 641fe4e4aa95689d8c79836bfde1ad2a562a1da7. --- .../mrpowers/spark/fast/tests/DatasetComparerTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala index dd9185d..f8d02a2 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala @@ -81,8 +81,8 @@ class DatasetComparerTest extends AnyFreeSpec with DatasetComparer with SparkSes val colourGroup = e.getMessage.extractColorGroup val expectedColourGroup = colourGroup.get(Console.GREEN) val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("String(StructField(long,LongType2,true,{}))"))) - assert(actualColourGroup.contains(Seq("String(StructField(long,LongType,true,{}))"))) + assert(expectedColourGroup.contains(Seq("StructField(long,LongType2,true,{})"))) + assert(actualColourGroup.contains(Seq("StructField(long,LongType,true,{})"))) } "correctly mark unequal element for Dataset[Seq[String]]" in { From e4a1e688ff9230d213e22b7eb4b52f030834f2ec Mon Sep 17 00:00:00 2001 From: zeotuan <48720253+zeotuan@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:34:30 +1100 Subject: [PATCH 02/10] Revert "remove space" This reverts commit fb62226987e7cfda242b016a8e542783a24876f8. --- .../mrpowers/spark/fast/tests/DatasetComparerTest.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala index f8d02a2..5791a5d 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala @@ -153,12 +153,13 @@ class DatasetComparerTest extends AnyFreeSpec with DatasetComparer with SparkSes val e = intercept[DatasetContentMismatch] { assertSmallDatasetEquality(sourceDS, expectedDS) } + println(e) val colourGroup = e.getMessage.extractColorGroup val expectedColourGroup = colourGroup.get(Console.GREEN) val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("(apple,banana1)"))) - assert(actualColourGroup.contains(Seq("(apple,banana)"))) + assert(expectedColourGroup.contains(Seq("(apple, banana1)"))) + assert(actualColourGroup.contains(Seq("(apple, banana)"))) } "works with really long columns" in { From c6689558f72ba505978edd276e16d415de437a9c Mon Sep 17 00:00:00 2001 From: zeotuan <48720253+zeotuan@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:34:30 +1100 Subject: [PATCH 03/10] Revert "handle Iterable cases" This reverts commit 0c4e2c1574ed32c154e130fa8c16a479f341d280. --- .../spark/fast/tests/ProductUtil.scala | 12 +-- .../fast/tests/DatasetComparerTest.scala | 96 ------------------- 2 files changed, 5 insertions(+), 103 deletions(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala index b4b464c..d2d1f15 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala @@ -10,12 +10,10 @@ import scala.reflect.ClassTag object ProductUtil { private[mrpowers] def productOrRowToSeq(product: Any): Seq[Any] = { product match { - case null => Seq.empty - case a: Array[_] => a - case i: Iterable[_] => i.toSeq - case r: Row => r.toSeq - case p: Product => p.productIterator.toSeq - case s => Seq(s) + case null => Seq.empty + case r: Row => r.toSeq + case p: Product => p.productIterator.toSeq + case s => Seq(s) } } private[mrpowers] def showProductDiff[T: ClassTag]( @@ -47,7 +45,7 @@ object ProductUtil { List(Red(prodToString(actualSeq)), Green(emptyProd)) else { val withEquals = actualSeq - .zipAll(expectedSeq, "MISSING", "MISSING") + .zip(expectedSeq) .map { case (actualRowField, expectedRowField) => (actualRowField, expectedRowField, actualRowField == expectedRowField) } diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala index 5791a5d..fb5e9e4 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala @@ -2,10 +2,8 @@ package com.github.mrpowers.spark.fast.tests import org.apache.spark.sql.types._ import SparkSessionExt._ -import com.github.mrpowers.spark.fast.tests.ProductUtil.showProductDiff import com.github.mrpowers.spark.fast.tests.SchemaComparer.DatasetSchemaMismatch import com.github.mrpowers.spark.fast.tests.StringExt.StringOps -import org.apache.spark.sql.Row import org.scalatest.freespec.AnyFreeSpec object Person { @@ -68,100 +66,6 @@ class DatasetComparerTest extends AnyFreeSpec with DatasetComparer with SparkSes assert(actualColourGroup.contains(Seq("Person(bob,1)", "alice"))) } - "correctly mark unequal element for Dataset[String]" in { - import spark.implicits._ - val sourceDS = Seq("word", "StringType", "StructField(long,LongType,true,{})").toDS - - val expectedDS = List("word", "StringType", "StructField(long,LongType2,true,{})").toDS - - val e = intercept[DatasetContentMismatch] { - assertSmallDatasetEquality(sourceDS, expectedDS) - } - - val colourGroup = e.getMessage.extractColorGroup - val expectedColourGroup = colourGroup.get(Console.GREEN) - val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("StructField(long,LongType2,true,{})"))) - assert(actualColourGroup.contains(Seq("StructField(long,LongType,true,{})"))) - } - - "correctly mark unequal element for Dataset[Seq[String]]" in { - import spark.implicits._ - - val sourceDS = Seq( - Seq("apple", "banana", "cherry"), - Seq("dog", "cat"), - Seq("red", "green", "blue") - ).toDS - - val expectedDS = Seq( - Seq("apple", "banana2"), - Seq("dog", "cat"), - Seq("red", "green", "blue") - ).toDS - - val e = intercept[DatasetContentMismatch] { - assertSmallDatasetEquality(sourceDS, expectedDS) - } - - val colourGroup = e.getMessage.extractColorGroup - val expectedColourGroup = colourGroup.get(Console.GREEN) - val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("banana2", "MISSING"))) - assert(actualColourGroup.contains(Seq("banana", "cherry"))) - } - - "correctly mark unequal element for Dataset[Array[String]]" in { - import spark.implicits._ - - val sourceDS = Seq( - Array("apple", "banana", "cherry"), - Array("dog", "cat"), - Array("red", "green", "blue") - ).toDS - - val expectedDS = Seq( - Array("apple", "banana2"), - Array("dog", "cat"), - Array("red", "green", "blue") - ).toDS - - val e = intercept[DatasetContentMismatch] { - assertSmallDatasetEquality(sourceDS, expectedDS) - } - - val colourGroup = e.getMessage.extractColorGroup - val expectedColourGroup = colourGroup.get(Console.GREEN) - val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("banana2", "MISSING"))) - assert(actualColourGroup.contains(Seq("banana", "cherry"))) - } - - "correctly mark unequal element for Dataset[Map[String, String]]" in { - import spark.implicits._ - - val sourceDS = Seq( - Map("apple" -> "banana", "apple1" -> "banana1"), - Map("apple" -> "banana", "apple1" -> "banana1") - ).toDS - - val expectedDS = Seq( - Map("apple" -> "banana1", "apple1" -> "banana1"), - Map("apple" -> "banana", "apple1" -> "banana1") - ).toDS - - val e = intercept[DatasetContentMismatch] { - assertSmallDatasetEquality(sourceDS, expectedDS) - } - println(e) - - val colourGroup = e.getMessage.extractColorGroup - val expectedColourGroup = colourGroup.get(Console.GREEN) - val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("(apple, banana1)"))) - assert(actualColourGroup.contains(Seq("(apple, banana)"))) - } - "works with really long columns" in { val sourceDS = Seq( Person("juanisareallygoodguythatilikealotOK", 5), From 7fc21c643ba2e6605ec92d9987b56d65cf2cb0d2 Mon Sep 17 00:00:00 2001 From: zeotuan <48720253+zeotuan@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:34:30 +1100 Subject: [PATCH 04/10] Revert "formatting" This reverts commit 54419ec1952d1e37571cc0b6231e5c387821ca5b. --- .../com/github/mrpowers/spark/fast/tests/ProductUtil.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala index d2d1f15..000cbf1 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala @@ -21,7 +21,7 @@ object ProductUtil { actual: Seq[T], expected: Seq[T], truncate: Int = 20, - minColWidth: Int = 3 + minColWidth: Int = 3, ): String = { val runTimeClass = implicitly[ClassTag[T]].runtimeClass From bdf75d37280b7fc9c5290d9494b16c8f64b7e77d Mon Sep 17 00:00:00 2001 From: zeotuan <48720253+zeotuan@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:34:30 +1100 Subject: [PATCH 05/10] Revert "Disallow input default val" This reverts commit aebbe6ebcd176b41cb48186dbd8263a10aee1d0f. --- .../github/mrpowers/spark/fast/tests/DatasetComparer.scala | 2 +- .../com/github/mrpowers/spark/fast/tests/ProductUtil.scala | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala index e77a59d..e71a115 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala @@ -71,7 +71,7 @@ Expected DataFrame Row Count: '$expectedCount' val e = expectedDS.collect().toSeq if (!a.approximateSameElements(e, equals)) { val arr = ("Actual Content", "Expected Content") - val msg = "Diffs\n" ++ ProductUtil.showProductDiff(arr, a, e, truncate) + val msg = "Diffs\n" ++ ProductUtil.showProductDiff[T](arr, a, e, truncate) throw DatasetContentMismatch(msg) } } diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala index 000cbf1..72c8d7a 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala @@ -10,9 +10,9 @@ import scala.reflect.ClassTag object ProductUtil { private[mrpowers] def productOrRowToSeq(product: Any): Seq[Any] = { product match { - case null => Seq.empty case r: Row => r.toSeq case p: Product => p.productIterator.toSeq + case null => Seq.empty case s => Seq(s) } } @@ -22,6 +22,7 @@ object ProductUtil { expected: Seq[T], truncate: Int = 20, minColWidth: Int = 3, + defaultVal: T = null.asInstanceOf[T] ): String = { val runTimeClass = implicitly[ClassTag[T]].runtimeClass @@ -31,7 +32,7 @@ object ProductUtil { val sb = new StringBuilder - val fullJoin = actual.zipAll(expected, null, null) + val fullJoin = actual.zipAll(expected, defaultVal, defaultVal) val diff = fullJoin.map { case (actualRow, expectedRow) => if (actualRow == expectedRow) { From 786fd5e5dac25746a2f429c80f8fa16c8913e097 Mon Sep 17 00:00:00 2001 From: zeotuan <48720253+zeotuan@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:34:30 +1100 Subject: [PATCH 06/10] Revert "Make Row Diff not display class name" This reverts commit 92766971447e00089cc97cf9b03233ae7373fcb8. --- .../github/mrpowers/spark/fast/tests/ProductUtil.scala | 9 +++++---- .../spark/fast/tests/DataFrameComparerTest.scala | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala index 72c8d7a..a776f9b 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala @@ -26,8 +26,9 @@ object ProductUtil { ): String = { val runTimeClass = implicitly[ClassTag[T]].runtimeClass - val (className, lBracket, rBracket) = if (runTimeClass == classOf[Row]) ("", "[", "]") else (runTimeClass.getSimpleName, "(", ")") - val prodToString: Seq[Any] => String = s => s.mkString(s"$className$lBracket", ",", rBracket) + val className = runTimeClass.getSimpleName + val border = if (runTimeClass == classOf[Row]) ("[", "]") else ("(", ")") + val prodToString: Seq[Any] => String = s => s.mkString(s"$className${border._1}", ",", border._2) val emptyProd = "MISSING" val sb = new StringBuilder @@ -61,9 +62,9 @@ object ProductUtil { case (actualRowField, expectedRowField, false) => (Red(actualRowField.toString), Green(expectedRowField.toString)) } - val start = DarkGray(s"$className$lBracket") + val start = DarkGray(s"$className${border._1}") val sep = DarkGray(",") - val end = DarkGray(rBracket) + val end = DarkGray(border._2) List( coloredDiff.map(_._1).mkStr(start, sep, end), coloredDiff.map(_._2).mkStr(start, sep, end) diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala index f4ebfb2..09daa83 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala @@ -74,8 +74,8 @@ class DataFrameComparerTest extends AnyFreeSpec with DataFrameComparer with Spar val colourGroup = e.getMessage.extractColorGroup val expectedColourGroup = colourGroup.get(Console.GREEN) val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("uk", "[steve,10,aus]"))) - assert(actualColourGroup.contains(Seq("france", "[mark,11,usa]"))) + assert(expectedColourGroup.contains(Seq("uk", "Row[steve,10,aus]"))) + assert(actualColourGroup.contains(Seq("france", "Row[mark,11,usa]"))) } "works well for wide DataFrames" in { From fabb7d8d12d7bb93ba2a548128eaadee6af286d5 Mon Sep 17 00:00:00 2001 From: zeotuan <48720253+zeotuan@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:34:30 +1100 Subject: [PATCH 07/10] Revert "Handle single valued case" This reverts commit b24c39f3312e715f1a7a86e7d9f08ef0641f6de9. --- .../com/github/mrpowers/spark/fast/tests/ProductUtil.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala index a776f9b..4840ee0 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala @@ -10,10 +10,10 @@ import scala.reflect.ClassTag object ProductUtil { private[mrpowers] def productOrRowToSeq(product: Any): Seq[Any] = { product match { + case null => Seq.empty case r: Row => r.toSeq case p: Product => p.productIterator.toSeq - case null => Seq.empty - case s => Seq(s) + case _ => throw new IllegalArgumentException("Only Row and Product types are supported") } } private[mrpowers] def showProductDiff[T: ClassTag]( From 77433c6796ceab7448e1edee0d818f3f9365bf69 Mon Sep 17 00:00:00 2001 From: zeotuan <48720253+zeotuan@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:34:30 +1100 Subject: [PATCH 08/10] Revert "Add Test for schema diff" This reverts commit 95ae6c8f284170ec27cfd8e4521d42e5ddd08853. --- .../spark/fast/tests/ProductUtil.scala | 2 +- .../spark/fast/tests/SchemaComparer.scala | 3 +- .../fast/tests/DataFrameComparerTest.scala | 37 +-------- .../fast/tests/DatasetComparerTest.scala | 81 ++----------------- 4 files changed, 12 insertions(+), 111 deletions(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala index 4840ee0..ab2da0e 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala @@ -29,7 +29,7 @@ object ProductUtil { val className = runTimeClass.getSimpleName val border = if (runTimeClass == classOf[Row]) ("[", "]") else ("(", ")") val prodToString: Seq[Any] => String = s => s.mkString(s"$className${border._1}", ",", border._2) - val emptyProd = "MISSING" + val emptyProd = s"$className${border._1}${border._2}" val sb = new StringBuilder diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala index 89f6783..f67fa73 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala @@ -11,7 +11,8 @@ object SchemaComparer { ("Actual Schema", "Expected Schema"), actualDS.schema.fields, expectedDS.schema.fields, - truncate = 200 + truncate = 200, + defaultVal = StructField("SPARK_FAST_TEST_MISSING_FIELD", NullType) ) } diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala index 09daa83..a8b84f1 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala @@ -1,6 +1,6 @@ package com.github.mrpowers.spark.fast.tests -import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, StringType} +import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType} import SparkSessionExt._ import com.github.mrpowers.spark.fast.tests.SchemaComparer.DatasetSchemaMismatch import com.github.mrpowers.spark.fast.tests.StringExt.StringOps @@ -310,41 +310,6 @@ class DataFrameComparerTest extends AnyFreeSpec with DataFrameComparer with Spar ) assertLargeDataFrameEquality(sourceDF, expectedDF, ignoreColumnOrder = true) } - - "correctly mark unequal schema field" in { - val sourceDF = spark.createDF( - List( - (1, 2.0), - (5, 3.0) - ), - List( - ("number", IntegerType, true), - ("float", DoubleType, true) - ) - ) - - val expectedDF = spark.createDF( - List( - (1, "word", 1L), - (5, "word", 2L) - ), - List( - ("number", IntegerType, true), - ("word", StringType, true), - ("long", LongType, true) - ) - ) - - val e = intercept[DatasetSchemaMismatch] { - assertSmallDataFrameEquality(sourceDF, expectedDF) - } - - val colourGroup = e.getMessage.extractColorGroup - val expectedColourGroup = colourGroup.get(Console.GREEN) - val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("word", "StringType", "StructField(long,LongType,true,{})"))) - assert(actualColourGroup.contains(Seq("float", "DoubleType", "MISSING"))) - } } "assertApproximateDataFrameEquality" - { diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala index fb5e9e4..f645661 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala @@ -207,13 +207,17 @@ class DatasetComparerTest extends AnyFreeSpec with DatasetComparer with SparkSes ) ) - intercept[DatasetSchemaMismatch] { + val e = intercept[DatasetSchemaMismatch] { assertLargeDatasetEquality(sourceDF, expectedDF) } - - intercept[DatasetSchemaMismatch] { + println(e) + val e2 = intercept[DatasetSchemaMismatch] { assertSmallDatasetEquality(sourceDF, expectedDF) } + println(e2) + + sourceDF.schema.printTreeString() + expectedDF.schema.printTreeString() } "throws an error if the DataFrames content is different" in { @@ -442,41 +446,6 @@ class DatasetComparerTest extends AnyFreeSpec with DatasetComparer with SparkSes assertLargeDatasetEquality(ds1, ds2, ignoreColumnOrder = true) assertLargeDatasetEquality(ds2, ds1, ignoreColumnOrder = true) } - - "correctly mark unequal schema field" in { - val sourceDF = spark.createDF( - List( - (1, 2.0), - (5, 3.0) - ), - List( - ("number", IntegerType, true), - ("float", DoubleType, true) - ) - ) - - val expectedDF = spark.createDF( - List( - (1, "word", 1L), - (5, "word", 2L) - ), - List( - ("number", IntegerType, true), - ("word", StringType, true), - ("long", LongType, true) - ) - ) - - val e = intercept[DatasetSchemaMismatch] { - assertLargeDatasetEquality(sourceDF, expectedDF) - } - - val colourGroup = e.getMessage.extractColorGroup - val expectedColourGroup = colourGroup.get(Console.GREEN) - val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("word", "StringType", "StructField(long,LongType,true,{})"))) - assert(actualColourGroup.contains(Seq("float", "DoubleType", "MISSING"))) - } } "assertSmallDatasetEquality" - { @@ -636,43 +605,9 @@ class DatasetComparerTest extends AnyFreeSpec with DatasetComparer with SparkSes Person("alice", 5) ).toDS.select("age", "name").as(ds1.encoder) + assertSmallDatasetEquality(ds1, ds2, ignoreColumnOrder = true) assertSmallDatasetEquality(ds2, ds1, ignoreColumnOrder = true) } - - "correctly mark unequal schema field" in { - val sourceDF = spark.createDF( - List( - (1, 2.0), - (5, 3.0) - ), - List( - ("number", IntegerType, true), - ("float", DoubleType, true) - ) - ) - - val expectedDF = spark.createDF( - List( - (1, "word", 1L), - (5, "word", 2L) - ), - List( - ("number", IntegerType, true), - ("word", StringType, true), - ("long", LongType, true) - ) - ) - - val e = intercept[DatasetSchemaMismatch] { - assertSmallDatasetEquality(sourceDF, expectedDF) - } - - val colourGroup = e.getMessage.extractColorGroup - val expectedColourGroup = colourGroup.get(Console.GREEN) - val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("word", "StringType", "StructField(long,LongType,true,{})"))) - assert(actualColourGroup.contains(Seq("float", "DoubleType", "MISSING"))) - } } "defaultSortDataset" - { From ce9e7b128471bc73f144d914ba4ed8e27e42bcf9 Mon Sep 17 00:00:00 2001 From: zeotuan <48720253+zeotuan@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:34:30 +1100 Subject: [PATCH 09/10] Revert "Determine bracket based on type" This reverts commit 72eb3a96b6888ea92c9ac92de0f3bdc54da5b5ce. --- .../github/mrpowers/spark/fast/tests/ProductUtil.scala | 10 ++++------ .../mrpowers/spark/fast/tests/SchemaComparer.scala | 3 ++- .../spark/fast/tests/DataFrameComparerTest.scala | 4 ++-- .../spark/fast/tests/DatasetComparerTest.scala | 4 ++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala index ab2da0e..ffbe668 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala @@ -22,14 +22,12 @@ object ProductUtil { expected: Seq[T], truncate: Int = 20, minColWidth: Int = 3, - defaultVal: T = null.asInstanceOf[T] + defaultVal: T = null.asInstanceOf[T], + border: (String, String) = ("[", "]") ): String = { - - val runTimeClass = implicitly[ClassTag[T]].runtimeClass - val className = runTimeClass.getSimpleName - val border = if (runTimeClass == classOf[Row]) ("[", "]") else ("(", ")") + val className = implicitly[ClassTag[T]].runtimeClass.getSimpleName val prodToString: Seq[Any] => String = s => s.mkString(s"$className${border._1}", ",", border._2) - val emptyProd = s"$className${border._1}${border._2}" + val emptyProd = s"$className()" val sb = new StringBuilder diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala index f67fa73..9316d76 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala @@ -12,7 +12,8 @@ object SchemaComparer { actualDS.schema.fields, expectedDS.schema.fields, truncate = 200, - defaultVal = StructField("SPARK_FAST_TEST_MISSING_FIELD", NullType) + defaultVal = StructField("SPARK_FAST_TEST_MISSING_FIELD", NullType), + border = ("(", ")") ) } diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala index a8b84f1..d8620b3 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DataFrameComparerTest.scala @@ -74,8 +74,8 @@ class DataFrameComparerTest extends AnyFreeSpec with DataFrameComparer with Spar val colourGroup = e.getMessage.extractColorGroup val expectedColourGroup = colourGroup.get(Console.GREEN) val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("uk", "Row[steve,10,aus]"))) - assert(actualColourGroup.contains(Seq("france", "Row[mark,11,usa]"))) + assert(expectedColourGroup.contains(Seq("uk", "[steve,10,aus]"))) + assert(actualColourGroup.contains(Seq("france", "[mark,11,usa]"))) } "works well for wide DataFrames" in { diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala index f645661..284911d 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala @@ -62,8 +62,8 @@ class DatasetComparerTest extends AnyFreeSpec with DatasetComparer with SparkSes val colourGroup = e.getMessage.extractColorGroup val expectedColourGroup = colourGroup.get(Console.GREEN) val actualColourGroup = colourGroup.get(Console.RED) - assert(expectedColourGroup.contains(Seq("Person(frank,10)", "lucy"))) - assert(actualColourGroup.contains(Seq("Person(bob,1)", "alice"))) + assert(expectedColourGroup.contains(Seq("[frank,10]", "lucy"))) + assert(actualColourGroup.contains(Seq("[bob,1]", "alice"))) } "works with really long columns" in { From a099d302c9bd170f2b32ad207365f5bd4fbd9119 Mon Sep 17 00:00:00 2001 From: zeotuan <48720253+zeotuan@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:34:30 +1100 Subject: [PATCH 10/10] Revert "Add Table support for StructField Diff" This reverts commit 58c631fcca6588dfd66f519e78db47bcbff4d063. --- ...{ProductUtil.scala => DataframeUtil.scala} | 63 ++++++++----------- .../spark/fast/tests/DatasetComparer.scala | 8 +-- .../spark/fast/tests/SchemaComparer.scala | 32 ++++++---- .../fast/tests/DatasetComparerTest.scala | 51 ++------------- 4 files changed, 58 insertions(+), 96 deletions(-) rename core/src/main/scala/com/github/mrpowers/spark/fast/tests/{ProductUtil.scala => DataframeUtil.scala} (65%) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DataframeUtil.scala similarity index 65% rename from core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala rename to core/src/main/scala/com/github/mrpowers/spark/fast/tests/DataframeUtil.scala index ffbe668..6cfde87 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/ProductUtil.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DataframeUtil.scala @@ -1,48 +1,35 @@ package com.github.mrpowers.spark.fast.tests import com.github.mrpowers.spark.fast.tests.ufansi.Color.{DarkGray, Green, Red} -import com.github.mrpowers.spark.fast.tests.ufansi.FansiExtensions.StrOps import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.Row +import com.github.mrpowers.spark.fast.tests.ufansi.FansiExtensions.StrOps +object DataframeUtil { -import scala.reflect.ClassTag - -object ProductUtil { - private[mrpowers] def productOrRowToSeq(product: Any): Seq[Any] = { - product match { - case null => Seq.empty - case r: Row => r.toSeq - case p: Product => p.productIterator.toSeq - case _ => throw new IllegalArgumentException("Only Row and Product types are supported") - } - } - private[mrpowers] def showProductDiff[T: ClassTag]( + private[mrpowers] def showDataframeDiff( header: (String, String), - actual: Seq[T], - expected: Seq[T], + actual: Seq[Row], + expected: Seq[Row], truncate: Int = 20, - minColWidth: Int = 3, - defaultVal: T = null.asInstanceOf[T], - border: (String, String) = ("[", "]") + minColWidth: Int = 3 ): String = { - val className = implicitly[ClassTag[T]].runtimeClass.getSimpleName - val prodToString: Seq[Any] => String = s => s.mkString(s"$className${border._1}", ",", border._2) - val emptyProd = s"$className()" val sb = new StringBuilder - val fullJoin = actual.zipAll(expected, defaultVal, defaultVal) - + val fullJoin = actual.zipAll(expected, Row(), Row()) val diff = fullJoin.map { case (actualRow, expectedRow) => - if (actualRow == expectedRow) { + if (equals(actualRow, expectedRow)) { List(DarkGray(actualRow.toString), DarkGray(expectedRow.toString)) } else { - val actualSeq = productOrRowToSeq(actualRow) - val expectedSeq = productOrRowToSeq(expectedRow) + val actualSeq = actualRow.toSeq + val expectedSeq = expectedRow.toSeq if (actualSeq.isEmpty) - List(Red(emptyProd), Green(prodToString(expectedSeq))) + List( + Red("[]"), + Green(expectedSeq.mkString("[", ",", "]")) + ) else if (expectedSeq.isEmpty) - List(Red(prodToString(actualSeq)), Green(emptyProd)) + List(Red(actualSeq.mkString("[", ",", "]")), Green("[]")) else { val withEquals = actualSeq .zip(expectedSeq) @@ -51,8 +38,12 @@ object ProductUtil { } val allFieldsAreNotEqual = !withEquals.exists(_._3) if (allFieldsAreNotEqual) { - List(Red(prodToString(actualSeq)), Green(prodToString(expectedSeq))) + List( + Red(actualSeq.mkString("[", ",", "]")), + Green(expectedSeq.mkString("[", ",", "]")) + ) } else { + val coloredDiff = withEquals .map { case (actualRowField, expectedRowField, true) => @@ -60,9 +51,9 @@ object ProductUtil { case (actualRowField, expectedRowField, false) => (Red(actualRowField.toString), Green(expectedRowField.toString)) } - val start = DarkGray(s"$className${border._1}") + val start = DarkGray("[") val sep = DarkGray(",") - val end = DarkGray(border._2) + val end = DarkGray("]") List( coloredDiff.map(_._1).mkStr(start, sep, end), coloredDiff.map(_._2).mkStr(start, sep, end) @@ -78,12 +69,11 @@ object ProductUtil { val colWidths = Array.fill(numCols)(minColWidth) // Compute the width of each column - headerSeq.zipWithIndex.foreach({ case (cell, i) => + for ((cell, i) <- headerSeq.zipWithIndex) { colWidths(i) = math.max(colWidths(i), cell.length) - }) - - diff.foreach { row => - row.zipWithIndex.foreach { case (cell, i) => + } + for (row <- diff) { + for ((cell, i) <- row.zipWithIndex) { colWidths(i) = math.max(colWidths(i), cell.length) } } @@ -127,4 +117,5 @@ object ProductUtil { sb.toString } + } diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala index e71a115..70b30dc 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala @@ -38,7 +38,7 @@ Expected DataFrame Row Count: '$expectedCount' /** * Raises an error unless `actualDS` and `expectedDS` are equal */ - def assertSmallDatasetEquality[T: ClassTag]( + def assertSmallDatasetEquality[T]( actualDS: Dataset[T], expectedDS: Dataset[T], ignoreNullable: Boolean = false, @@ -53,7 +53,7 @@ Expected DataFrame Row Count: '$expectedCount' assertSmallDatasetContentEquality(actual, expectedDS, orderedComparison, truncate, equals) } - def assertSmallDatasetContentEquality[T: ClassTag]( + def assertSmallDatasetContentEquality[T]( actualDS: Dataset[T], expectedDS: Dataset[T], orderedComparison: Boolean, @@ -66,12 +66,12 @@ Expected DataFrame Row Count: '$expectedCount' assertSmallDatasetContentEquality(defaultSortDataset(actualDS), defaultSortDataset(expectedDS), truncate, equals) } - def assertSmallDatasetContentEquality[T: ClassTag](actualDS: Dataset[T], expectedDS: Dataset[T], truncate: Int, equals: (T, T) => Boolean): Unit = { + def assertSmallDatasetContentEquality[T](actualDS: Dataset[T], expectedDS: Dataset[T], truncate: Int, equals: (T, T) => Boolean): Unit = { val a = actualDS.collect().toSeq val e = expectedDS.collect().toSeq if (!a.approximateSameElements(e, equals)) { val arr = ("Actual Content", "Expected Content") - val msg = "Diffs\n" ++ ProductUtil.showProductDiff[T](arr, a, e, truncate) + val msg = "Diffs\n" ++ DataframeUtil.showDataframeDiff(arr, a.asRows, e.asRows, truncate) throw DatasetContentMismatch(msg) } } diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala index 9316d76..ce1edfe 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala @@ -1,20 +1,29 @@ package com.github.mrpowers.spark.fast.tests -import com.github.mrpowers.spark.fast.tests.ProductUtil.showProductDiff import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types.{ArrayType, DataType, MapType, NullType, StructField, StructType} +import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructType} object SchemaComparer { + case class DatasetSchemaMismatch(smth: String) extends Exception(smth) private def betterSchemaMismatchMessage[T](actualDS: Dataset[T], expectedDS: Dataset[T]): String = { - showProductDiff( - ("Actual Schema", "Expected Schema"), - actualDS.schema.fields, - expectedDS.schema.fields, - truncate = 200, - defaultVal = StructField("SPARK_FAST_TEST_MISSING_FIELD", NullType), - border = ("(", ")") - ) + "\nActual Schema Field | Expected Schema Field\n" + actualDS.schema + .zipAll( + expectedDS.schema, + "", + "" + ) + .map { + case (sf1, sf2) if sf1 == sf2 => + ufansi.Color.Blue(s"$sf1 | $sf2") + case ("", sf2) => + ufansi.Color.Red(s"MISSING | $sf2") + case (sf1, "") => + ufansi.Color.Red(s"$sf1 | MISSING") + case (sf1, sf2) => + ufansi.Color.Red(s"$sf1 | $sf2") + } + .mkString("\n") } def assertSchemaEqual[T]( @@ -27,7 +36,7 @@ object SchemaComparer { require((ignoreColumnNames, ignoreColumnOrder) != (true, true), "Cannot set both ignoreColumnNames and ignoreColumnOrder to true.") if (!SchemaComparer.equals(actualDS.schema, expectedDS.schema, ignoreNullable, ignoreColumnNames, ignoreColumnOrder)) { throw DatasetSchemaMismatch( - "Diffs\n" + betterSchemaMismatchMessage(actualDS, expectedDS) + betterSchemaMismatchMessage(actualDS, expectedDS) ) } } @@ -67,4 +76,5 @@ object SchemaComparer { case _ => dt1 == dt2 } } + } diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala index 284911d..0ab6b27 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/DatasetComparerTest.scala @@ -154,70 +154,31 @@ class DatasetComparerTest extends AnyFreeSpec with DatasetComparer with SparkSes } "throws an error if the DataFrames have different schemas" in { - val nestedSchema = StructType( - Seq( - StructField( - "attributes", - StructType( - Seq( - StructField("PostCode", IntegerType, nullable = true) - ) - ), - nullable = true - ) - ) - ) - - val nestedSchema2 = StructType( - Seq( - StructField( - "attributes", - StructType( - Seq( - StructField("PostCode", StringType, nullable = true) - ) - ), - nullable = true - ) - ) - ) - val sourceDF = spark.createDF( List( - (1, 2.0, null), - (5, 3.0, null) + (1), + (5) ), - List( - ("number", IntegerType, true), - ("float", DoubleType, true), - ("nestedField", nestedSchema, true) - ) + List(("number", IntegerType, true)) ) val expectedDF = spark.createDF( List( - (1, "word", null, 1L), - (5, "word", null, 2L) + (1, "word"), + (5, "word") ), List( ("number", IntegerType, true), - ("word", StringType, true), - ("nestedField", nestedSchema2, true), - ("long", LongType, true) + ("word", StringType, true) ) ) val e = intercept[DatasetSchemaMismatch] { assertLargeDatasetEquality(sourceDF, expectedDF) } - println(e) val e2 = intercept[DatasetSchemaMismatch] { assertSmallDatasetEquality(sourceDF, expectedDF) } - println(e2) - - sourceDF.schema.printTreeString() - expectedDF.schema.printTreeString() } "throws an error if the DataFrames content is different" in {