From eb3ff4a3e9e49c0d9bfde1c1482cc8c756ed0d15 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Mon, 11 Nov 2024 18:05:16 +1100 Subject: [PATCH 1/5] Add tree-string color diff --- .../spark/fast/tests/DatasetComparer.scala | 6 +- .../spark/fast/tests/SchemaComparer.scala | 98 ++++++++++++++++++- .../spark/fast/tests/SchemaComparerTest.scala | 68 +++++++++++++ 3 files changed, 165 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala index 2545156..0ff191d 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/DatasetComparer.scala @@ -3,8 +3,8 @@ package com.github.mrpowers.spark.fast.tests import com.github.mrpowers.spark.fast.tests.DatasetComparer.maxUnequalRowsToShow import com.github.mrpowers.spark.fast.tests.SeqLikesExtensions.SeqExtensions import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.reflect.ClassTag @@ -49,7 +49,7 @@ Expected DataFrame Row Count: '$expectedCount' truncate: Int = 500, equals: (T, T) => Boolean = (o1: T, o2: T) => o1.equals(o2) ): Unit = { - SchemaComparer.assertSchemaEqual(actualDS, expectedDS, ignoreNullable, ignoreColumnNames, ignoreColumnOrder, ignoreMetadata) + SchemaComparer.assertDatasetSchemaEqual(actualDS, expectedDS, ignoreNullable, ignoreColumnNames, ignoreColumnOrder, ignoreMetadata) val actual = if (ignoreColumnOrder) orderColumns(actualDS, expectedDS) else actualDS assertSmallDatasetContentEquality(actual, expectedDS, orderedComparison, truncate, equals) } @@ -103,7 +103,7 @@ Expected DataFrame Row Count: '$expectedCount' ignoreMetadata: Boolean = true ): Unit = { // first check if the schemas are equal - SchemaComparer.assertSchemaEqual(actualDS, expectedDS, ignoreNullable, ignoreColumnNames, ignoreColumnOrder, ignoreMetadata) + SchemaComparer.assertDatasetSchemaEqual(actualDS, expectedDS, ignoreNullable, ignoreColumnNames, ignoreColumnOrder, ignoreMetadata) val actual = if (ignoreColumnOrder) orderColumns(actualDS, expectedDS) else actualDS assertLargeDatasetContentEquality(actual, expectedDS, equals, orderedComparison) } diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala index 266ffd3..4c8dd7f 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala @@ -1,8 +1,9 @@ package com.github.mrpowers.spark.fast.tests import com.github.mrpowers.spark.fast.tests.ProductUtil.showProductDiff +import com.github.mrpowers.spark.fast.tests.ufansi.Color.{DarkGray, Green, Red} import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types.{ArrayType, DataType, MapType, NullType, StructField, StructType} +import org.apache.spark.sql.types._ object SchemaComparer { case class DatasetSchemaMismatch(smth: String) extends Exception(smth) @@ -15,18 +16,107 @@ object SchemaComparer { ) } - def assertSchemaEqual[T]( + private def treeSchemaMismatchMessage[T](actualSchema: StructType, expectedSchema: StructType): String = { + def flattenStrucType(s: StructType, indent: Int): List[(Int, Int, StructField)] = s + .foldLeft(List.empty[(Int, Int, StructField)]) { case (fieldPair, f) => + // 5 char for each level of indentation, 21 char for gap, and description words + val gap = indent * 5 + 21 + f.name.length + f.dataType.typeName.length + f.nullable.toString.length + val pair = fieldPair :+ (indent, gap, f) + f.dataType match { + case st: StructType => pair ++ flattenStrucType(st, indent + 1) + case _ => pair + } + } + + def depthToIndentStr(depth: Int): String = Range(0, depth).map(_ => "| ").mkString + "|--" + val treeSpaces = 6 + val treeFieldPair1 = flattenStrucType(actualSchema, 0) + val treeFieldPair2 = flattenStrucType(expectedSchema, 0) + val headerGap = treeFieldPair1.groupBy(_._2).maxBy(_._1)._1 + treeSpaces + val treePair = treeFieldPair1 + .zipAll(treeFieldPair2, (0, 0, null), (0, 0, null)) + .map { case ((indent1, _, field1), (indent2, _, field2)) => + val prefix1 = depthToIndentStr(indent1) + val prefix2 = depthToIndentStr(indent2) + val (sprefix1, sprefix2) = if (indent1 != indent2) { + (Red(prefix1), Green(prefix2)) + } else { + (DarkGray(prefix1), DarkGray(prefix2)) + } + + if (field1 != null && field2 != null) { + val (name1, name2) = + if (field1.name != field2.name) + (Red(field1.name), Green(field2.name)) + else + (DarkGray(field1.name), DarkGray(field2.name)) + + val (dtype1, dtype2) = + if (field1.dataType != field2.dataType) + (Red(field1.dataType.typeName), Green(field2.dataType.typeName)) + else + (DarkGray(field1.dataType.typeName), DarkGray(field2.dataType.typeName)) + + val (nullable1, nullable2) = + if (field1.nullable != field2.nullable) + (Red(field1.nullable.toString), Green(field2.nullable.toString)) + else + (DarkGray(field1.nullable.toString), DarkGray(field2.nullable.toString)) + + val structString1 = s"$sprefix1 $name1 : $dtype1 (nullable = $nullable1)" + val structString2 = s"$sprefix2 $name2 : $dtype2 (nullable = $nullable2)" + (structString1, structString2) + } else { + val structString1 = if (field1 != null) { + val name = Red(field1.name) + val dtype = Red(field1.dataType.typeName) + val nullable = Red(field1.nullable.toString) + s"$prefix1 $name : $dtype (nullable = $nullable)" + } else "" + + val structString2 = if (field2 != null) { + val name = Green(field2.name) + val dtype = Green(field2.dataType.typeName) + val nullable = Green(field2.nullable.toString) + s"$prefix2 $name : $dtype (nullable = $nullable)" + } else "" + (structString1, structString2) + } + } + + val schemaGap = treePair.groupBy(_._1.length).maxBy(_._1)._1 + treeSpaces + + treePair + .foldLeft(new StringBuilder("\nActual Schema".padTo(headerGap, ' ') + "Expected Schema\n")) { case (sb, (s1, s2)) => + val gap = if (s1.isEmpty) headerGap else schemaGap + sb.append(s1.padTo(gap, ' ') + s2 + "\n") + } + .toString() + } + + def assertDatasetSchemaEqual[T]( actualDS: Dataset[T], expectedDS: Dataset[T], ignoreNullable: Boolean = false, ignoreColumnNames: Boolean = false, ignoreColumnOrder: Boolean = true, ignoreMetadata: Boolean = true + ): Unit = { + assertSchemaEqual(actualDS.schema, expectedDS.schema, ignoreNullable, ignoreColumnNames, ignoreColumnOrder, ignoreMetadata) + } + + def assertSchemaEqual( + actualSchema: StructType, + expectedSchema: StructType, + ignoreNullable: Boolean = false, + ignoreColumnNames: Boolean = false, + ignoreColumnOrder: Boolean = true, + ignoreMetadata: Boolean = true ): Unit = { require((ignoreColumnNames, ignoreColumnOrder) != (true, true), "Cannot set both ignoreColumnNames and ignoreColumnOrder to true.") - if (!SchemaComparer.equals(actualDS.schema, expectedDS.schema, ignoreNullable, ignoreColumnNames, ignoreColumnOrder, ignoreMetadata)) { + if (!SchemaComparer.equals(actualSchema, expectedSchema, ignoreNullable, ignoreColumnNames, ignoreColumnOrder, ignoreMetadata)) { throw DatasetSchemaMismatch( - "Diffs\n" + betterSchemaMismatchMessage(actualDS, expectedDS) + "Diffs\n" + treeSchemaMismatchMessage(actualSchema, expectedSchema) ) } } diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala index 714d001..09ba501 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala @@ -1,5 +1,6 @@ package com.github.mrpowers.spark.fast.tests +import com.github.mrpowers.spark.fast.tests.SchemaComparer.DatasetSchemaMismatch import org.apache.spark.sql.types._ import org.scalatest.freespec.AnyFreeSpec @@ -241,5 +242,72 @@ class SchemaComparerTest extends AnyFreeSpec { ) assert(SchemaComparer.equals(s1, s2)) } + + "display schema diff as tree" in { + val s1 = StructType( + Seq( + StructField("array", ArrayType(StringType, containsNull = true), true), + StructField("map", MapType(StringType, StringType, valueContainsNull = false), true), + StructField("something", StringType, true), + StructField( + "struct", + StructType( + StructType( + Seq( + StructField("mood", ArrayType(StringType, containsNull = false), true), + StructField("something", StringType, false), + StructField( + "something2", + StructType( + Seq( + StructField("mood2", ArrayType(DoubleType, containsNull = false), true), + StructField("something2", StringType, false) + ) + ), + false + ) + ) + ) + ), + true + ) + ) + ) + val s2 = StructType( + Seq( + StructField("something", StringType, true), + StructField("array", ArrayType(StringType, containsNull = true), true), + StructField("map", MapType(StringType, StringType, valueContainsNull = false), true), + StructField( + "struct", + StructType( + StructType( + Seq( + StructField("something", StringType, false), + StructField("mood", ArrayType(StringType, containsNull = false), true), + StructField( + "something3", + StructType( + Seq( + StructField("mood3", ArrayType(StringType, containsNull = false), true) + ) + ), + false + ) + ) + ) + ), + true + ), + StructField("norma2", StringType, false) + ) + ) + + val e = intercept[DatasetSchemaMismatch] { + SchemaComparer.assertSchemaEqual(s1, s2, ignoreColumnOrder = false) + } + println(e) + + } } } From e41016505506a52c8b37ce38b062f3f678dee96c Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Thu, 14 Nov 2024 19:22:33 +1100 Subject: [PATCH 2/5] Improve maxWidth Cal, Improve test --- .../spark/fast/tests/SchemaComparer.scala | 56 +++++++++++-------- .../fast/tests/SchemaDiffOutputFormat.scala | 7 +++ .../spark/fast/tests/SchemaComparerTest.scala | 20 +++++-- 3 files changed, 55 insertions(+), 28 deletions(-) create mode 100644 core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaDiffOutputFormat.scala diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala index 4c8dd7f..b69572c 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala @@ -1,41 +1,44 @@ package com.github.mrpowers.spark.fast.tests import com.github.mrpowers.spark.fast.tests.ProductUtil.showProductDiff +import com.github.mrpowers.spark.fast.tests.SchemaDiffOutputFormat.SchemaDiffOutputFormat import com.github.mrpowers.spark.fast.tests.ufansi.Color.{DarkGray, Green, Red} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types._ object SchemaComparer { case class DatasetSchemaMismatch(smth: String) extends Exception(smth) - private def betterSchemaMismatchMessage[T](actualDS: Dataset[T], expectedDS: Dataset[T]): String = { + private def betterSchemaMismatchMessage(actualSchema: StructType, expectedSchema: StructType): String = { showProductDiff( ("Actual Schema", "Expected Schema"), - actualDS.schema.fields, - expectedDS.schema.fields, + actualSchema.fields, + expectedSchema.fields, truncate = 200 ) } private def treeSchemaMismatchMessage[T](actualSchema: StructType, expectedSchema: StructType): String = { - def flattenStrucType(s: StructType, indent: Int): List[(Int, Int, StructField)] = s - .foldLeft(List.empty[(Int, Int, StructField)]) { case (fieldPair, f) => + def flattenStrucType(s: StructType, indent: Int): (Seq[(Int, StructField)], Int) = s + .foldLeft((Seq.empty[(Int, StructField)], Int.MinValue)) { case ((fieldPair, maxWidth), f) => // 5 char for each level of indentation, 21 char for gap, and description words - val gap = indent * 5 + 21 + f.name.length + f.dataType.typeName.length + f.nullable.toString.length - val pair = fieldPair :+ (indent, gap, f) + val gap = indent * 5 + 21 + f.name.length + f.dataType.typeName.length + f.nullable.toString.length + val pair = fieldPair :+ (indent, f) + val newMaxWidth = scala.math.max(maxWidth, gap) f.dataType match { - case st: StructType => pair ++ flattenStrucType(st, indent + 1) - case _ => pair + case st: StructType => + val (flattenPair, width) = flattenStrucType(st, indent + 1) + (pair ++ flattenPair, scala.math.max(newMaxWidth, width)) + case _ => (pair, newMaxWidth) } } def depthToIndentStr(depth: Int): String = Range(0, depth).map(_ => "| ").mkString + "|--" val treeSpaces = 6 - val treeFieldPair1 = flattenStrucType(actualSchema, 0) - val treeFieldPair2 = flattenStrucType(expectedSchema, 0) - val headerGap = treeFieldPair1.groupBy(_._2).maxBy(_._1)._1 + treeSpaces - val treePair = treeFieldPair1 - .zipAll(treeFieldPair2, (0, 0, null), (0, 0, null)) - .map { case ((indent1, _, field1), (indent2, _, field2)) => + val (treeFieldPair1, headerGap) = flattenStrucType(actualSchema, 0) + val (treeFieldPair2, _) = flattenStrucType(expectedSchema, 0) + val (treePair, maxWidth) = treeFieldPair1 + .zipAll(treeFieldPair2, (0, null), (0, null)) + .foldLeft((Seq.empty[(String, String)], 0)) { case ((acc, maxWidth), ((indent1, field1), (indent2, field2))) => val prefix1 = depthToIndentStr(indent1) val prefix2 = depthToIndentStr(indent2) val (sprefix1, sprefix2) = if (indent1 != indent2) { @@ -44,7 +47,7 @@ object SchemaComparer { (DarkGray(prefix1), DarkGray(prefix2)) } - if (field1 != null && field2 != null) { + val pair = if (field1 != null && field2 != null) { val (name1, name2) = if (field1.name != field2.name) (Red(field1.name), Green(field2.name)) @@ -82,10 +85,10 @@ object SchemaComparer { } else "" (structString1, structString2) } + (acc :+ pair, math.max(maxWidth, pair._1.length)) } - val schemaGap = treePair.groupBy(_._1.length).maxBy(_._1)._1 + treeSpaces - + val schemaGap = maxWidth + treeSpaces treePair .foldLeft(new StringBuilder("\nActual Schema".padTo(headerGap, ' ') + "Expected Schema\n")) { case (sb, (s1, s2)) => val gap = if (s1.isEmpty) headerGap else schemaGap @@ -100,9 +103,10 @@ object SchemaComparer { ignoreNullable: Boolean = false, ignoreColumnNames: Boolean = false, ignoreColumnOrder: Boolean = true, - ignoreMetadata: Boolean = true + ignoreMetadata: Boolean = true, + outputFormat: SchemaDiffOutputFormat = SchemaDiffOutputFormat.Table ): Unit = { - assertSchemaEqual(actualDS.schema, expectedDS.schema, ignoreNullable, ignoreColumnNames, ignoreColumnOrder, ignoreMetadata) + assertSchemaEqual(actualDS.schema, expectedDS.schema, ignoreNullable, ignoreColumnNames, ignoreColumnOrder, ignoreMetadata, outputFormat) } def assertSchemaEqual( @@ -111,13 +115,17 @@ object SchemaComparer { ignoreNullable: Boolean = false, ignoreColumnNames: Boolean = false, ignoreColumnOrder: Boolean = true, - ignoreMetadata: Boolean = true + ignoreMetadata: Boolean = true, + outputFormat: SchemaDiffOutputFormat = SchemaDiffOutputFormat.Table ): Unit = { require((ignoreColumnNames, ignoreColumnOrder) != (true, true), "Cannot set both ignoreColumnNames and ignoreColumnOrder to true.") if (!SchemaComparer.equals(actualSchema, expectedSchema, ignoreNullable, ignoreColumnNames, ignoreColumnOrder, ignoreMetadata)) { - throw DatasetSchemaMismatch( - "Diffs\n" + treeSchemaMismatchMessage(actualSchema, expectedSchema) - ) + val diffString = outputFormat match { + case SchemaDiffOutputFormat.Tree => treeSchemaMismatchMessage(actualSchema, expectedSchema) + case SchemaDiffOutputFormat.Table => betterSchemaMismatchMessage(actualSchema, expectedSchema) + } + + throw DatasetSchemaMismatch(s"Diffs\n$diffString") } } diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaDiffOutputFormat.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaDiffOutputFormat.scala new file mode 100644 index 0000000..acef8ff --- /dev/null +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaDiffOutputFormat.scala @@ -0,0 +1,7 @@ +package com.github.mrpowers.spark.fast.tests + +object SchemaDiffOutputFormat extends Enumeration { + type SchemaDiffOutputFormat = Value + + val Tree, Table = Value +} diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala index 09ba501..e792756 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala @@ -275,8 +275,8 @@ class SchemaComparerTest extends AnyFreeSpec { ) val s2 = StructType( Seq( - StructField("something", StringType, true), StructField("array", ArrayType(StringType, containsNull = true), true), + StructField("something", StringType, true), StructField("map", MapType(StringType, StringType, valueContainsNull = false), true), StructField( "struct", @@ -304,10 +304,22 @@ class SchemaComparerTest extends AnyFreeSpec { ) val e = intercept[DatasetSchemaMismatch] { - SchemaComparer.assertSchemaEqual(s1, s2, ignoreColumnOrder = false) + SchemaComparer.assertSchemaEqual(s1, s2, ignoreColumnOrder = false, outputFormat = SchemaDiffOutputFormat.Tree) } - println(e) - + val expectedMessage = """Diffs + | + |Actual Schema Expected Schema + |\u001b[90m|--\u001b[39m \u001b[90marray\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[90marray\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m|--\u001b[39m \u001b[31mmap\u001b[39m : \u001b[31mmap\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[32msomething\u001b[39m : \u001b[32mstring\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m|--\u001b[39m \u001b[31msomething\u001b[39m : \u001b[31mstring\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[32mmap\u001b[39m : \u001b[32mmap\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m|--\u001b[39m \u001b[90mstruct\u001b[39m : \u001b[31mstruct\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[90mstruct\u001b[39m : \u001b[32mstruct\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m| |--\u001b[39m \u001b[31mmood\u001b[39m : \u001b[31marray\u001b[39m (nullable = \u001b[31mtrue\u001b[39m) \u001b[90m| |--\u001b[39m \u001b[32msomething\u001b[39m : \u001b[32mstring\u001b[39m (nullable = \u001b[32mfalse\u001b[39m) + |\u001b[90m| |--\u001b[39m \u001b[31msomething\u001b[39m : \u001b[31mstring\u001b[39m (nullable = \u001b[31mfalse\u001b[39m) \u001b[90m| |--\u001b[39m \u001b[32mmood\u001b[39m : \u001b[32marray\u001b[39m (nullable = \u001b[32mtrue\u001b[39m) + |\u001b[90m| |--\u001b[39m \u001b[31msomething2\u001b[39m : \u001b[31mstruct\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) \u001b[90m| |--\u001b[39m \u001b[32msomething3\u001b[39m : \u001b[32mstruct\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) + |\u001b[90m| | |--\u001b[39m \u001b[31mmood2\u001b[39m : \u001b[31marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m| | |--\u001b[39m \u001b[32mmood3\u001b[39m : \u001b[32marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[31m| | |--\u001b[39m \u001b[31msomething2\u001b[39m : \u001b[90mstring\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) \u001b[32m|--\u001b[39m \u001b[32mnorma2\u001b[39m : \u001b[90mstring\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) + |""".stripMargin + assert(e.getMessage == expectedMessage) } } } From 3a3dc3befb257ea4aae178021f3e02db0bb6c761 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Sun, 17 Nov 2024 12:38:57 +1100 Subject: [PATCH 3/5] Fix alignment, More Comprehensive test --- .../spark/fast/tests/SchemaComparer.scala | 10 +- .../spark/fast/tests/SchemaComparerTest.scala | 233 +++++++++++++++++- 2 files changed, 237 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala index b69572c..7a003f4 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala @@ -34,7 +34,7 @@ object SchemaComparer { def depthToIndentStr(depth: Int): String = Range(0, depth).map(_ => "| ").mkString + "|--" val treeSpaces = 6 - val (treeFieldPair1, headerGap) = flattenStrucType(actualSchema, 0) + val (treeFieldPair1, tree1MaxWidth) = flattenStrucType(actualSchema, 0) val (treeFieldPair2, _) = flattenStrucType(expectedSchema, 0) val (treePair, maxWidth) = treeFieldPair1 .zipAll(treeFieldPair2, (0, null), (0, null)) @@ -74,14 +74,14 @@ object SchemaComparer { val name = Red(field1.name) val dtype = Red(field1.dataType.typeName) val nullable = Red(field1.nullable.toString) - s"$prefix1 $name : $dtype (nullable = $nullable)" + s"$sprefix1 $name : $dtype (nullable = $nullable)" } else "" val structString2 = if (field2 != null) { val name = Green(field2.name) val dtype = Green(field2.dataType.typeName) val nullable = Green(field2.nullable.toString) - s"$prefix2 $name : $dtype (nullable = $nullable)" + s"$sprefix2 $name : $dtype (nullable = $nullable)" } else "" (structString1, structString2) } @@ -89,10 +89,12 @@ object SchemaComparer { } val schemaGap = maxWidth + treeSpaces + val headerGap = tree1MaxWidth + treeSpaces treePair .foldLeft(new StringBuilder("\nActual Schema".padTo(headerGap, ' ') + "Expected Schema\n")) { case (sb, (s1, s2)) => val gap = if (s1.isEmpty) headerGap else schemaGap - sb.append(s1.padTo(gap, ' ') + s2 + "\n") + val s = if (s2.isEmpty) s1 else s1.padTo(gap, ' ') + sb.append(s + s2 + "\n") } .toString() } diff --git a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala index e792756..580547e 100644 --- a/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala +++ b/core/src/test/scala/com/github/mrpowers/spark/fast/tests/SchemaComparerTest.scala @@ -243,7 +243,7 @@ class SchemaComparerTest extends AnyFreeSpec { assert(SchemaComparer.equals(s1, s2)) } - "display schema diff as tree" in { + "display schema diff as tree with different depth" in { val s1 = StructType( Seq( StructField("array", ArrayType(StringType, containsNull = true), true), @@ -308,7 +308,7 @@ class SchemaComparerTest extends AnyFreeSpec { } val expectedMessage = """Diffs | - |Actual Schema Expected Schema + |Actual Schema Expected Schema |\u001b[90m|--\u001b[39m \u001b[90marray\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[90marray\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) |\u001b[90m|--\u001b[39m \u001b[31mmap\u001b[39m : \u001b[31mmap\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[32msomething\u001b[39m : \u001b[32mstring\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) |\u001b[90m|--\u001b[39m \u001b[31msomething\u001b[39m : \u001b[31mstring\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[32mmap\u001b[39m : \u001b[32mmap\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) @@ -319,6 +319,235 @@ class SchemaComparerTest extends AnyFreeSpec { |\u001b[90m| | |--\u001b[39m \u001b[31mmood2\u001b[39m : \u001b[31marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m| | |--\u001b[39m \u001b[32mmood3\u001b[39m : \u001b[32marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) |\u001b[31m| | |--\u001b[39m \u001b[31msomething2\u001b[39m : \u001b[90mstring\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) \u001b[32m|--\u001b[39m \u001b[32mnorma2\u001b[39m : \u001b[90mstring\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) |""".stripMargin + + assert(e.getMessage == expectedMessage) + } + + "display schema diff for wide tree" in { + val s1 = StructType( + Seq( + StructField("array", ArrayType(StringType, containsNull = true), true), + StructField("map", MapType(StringType, StringType, valueContainsNull = false), true), + StructField("something", StringType, true), + StructField( + "struct", + StructType( + StructType( + Seq( + StructField("mood", ArrayType(StringType, containsNull = false), true), + StructField("something", StringType, false), + StructField( + "something2", + StructType( + Seq( + StructField("mood2", ArrayType(DoubleType, containsNull = false), true), + StructField( + "something2", + StructType( + Seq( + StructField("mood", ArrayType(StringType, containsNull = false), true), + StructField("something", StringType, false), + StructField( + "something2", + StructType( + Seq( + StructField("mood2", ArrayType(DoubleType, containsNull = false), true), + StructField("something2", StringType, false) + ) + ), + false + ) + ) + ), + false + ) + ) + ), + false + ) + ) + ) + ), + true + ) + ) + ) + val s2 = StructType( + Seq( + StructField("array", ArrayType(StringType, containsNull = true), true), + StructField("something", StringType, true), + StructField("map", MapType(StringType, StringType, valueContainsNull = false), true), + StructField( + "struct", + StructType( + StructType( + Seq( + StructField("something", StringType, false), + StructField("mood", ArrayType(StringType, containsNull = false), true), + StructField( + "something3", + StructType( + Seq( + StructField("mood2", ArrayType(DoubleType, containsNull = false), true), + StructField( + "something2", + StructType( + Seq( + StructField("mood", ArrayType(StringType, containsNull = false), true), + StructField("something", StringType, false), + StructField( + "something2", + StructType( + Seq( + StructField("mood2", ArrayType(DoubleType, containsNull = false), true), + StructField("something2", StringType, false) + ) + ), + false + ) + ) + ), + false + ) + ) + ), + false + ) + ) + ) + ), + true + ), + StructField("norma2", StringType, false) + ) + ) + + val e = intercept[DatasetSchemaMismatch] { + SchemaComparer.assertSchemaEqual(s1, s2, ignoreColumnOrder = false, outputFormat = SchemaDiffOutputFormat.Tree) + } + val expectedMessage = """Diffs + | + |Actual Schema Expected Schema + |\u001b[90m|--\u001b[39m \u001b[90marray\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[90marray\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m|--\u001b[39m \u001b[31mmap\u001b[39m : \u001b[31mmap\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[32msomething\u001b[39m : \u001b[32mstring\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m|--\u001b[39m \u001b[31msomething\u001b[39m : \u001b[31mstring\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[32mmap\u001b[39m : \u001b[32mmap\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m|--\u001b[39m \u001b[90mstruct\u001b[39m : \u001b[31mstruct\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[90mstruct\u001b[39m : \u001b[32mstruct\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m| |--\u001b[39m \u001b[31mmood\u001b[39m : \u001b[31marray\u001b[39m (nullable = \u001b[31mtrue\u001b[39m) \u001b[90m| |--\u001b[39m \u001b[32msomething\u001b[39m : \u001b[32mstring\u001b[39m (nullable = \u001b[32mfalse\u001b[39m) + |\u001b[90m| |--\u001b[39m \u001b[31msomething\u001b[39m : \u001b[31mstring\u001b[39m (nullable = \u001b[31mfalse\u001b[39m) \u001b[90m| |--\u001b[39m \u001b[32mmood\u001b[39m : \u001b[32marray\u001b[39m (nullable = \u001b[32mtrue\u001b[39m) + |\u001b[90m| |--\u001b[39m \u001b[31msomething2\u001b[39m : \u001b[90mstruct\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) \u001b[90m| |--\u001b[39m \u001b[32msomething3\u001b[39m : \u001b[90mstruct\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) + |\u001b[90m| | |--\u001b[39m \u001b[90mmood2\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m| | |--\u001b[39m \u001b[90mmood2\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m| | |--\u001b[39m \u001b[90msomething2\u001b[39m : \u001b[90mstruct\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) \u001b[90m| | |--\u001b[39m \u001b[90msomething2\u001b[39m : \u001b[90mstruct\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) + |\u001b[90m| | | |--\u001b[39m \u001b[90mmood\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m| | | |--\u001b[39m \u001b[90mmood\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m| | | |--\u001b[39m \u001b[90msomething\u001b[39m : \u001b[90mstring\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) \u001b[90m| | | |--\u001b[39m \u001b[90msomething\u001b[39m : \u001b[90mstring\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) + |\u001b[90m| | | |--\u001b[39m \u001b[90msomething2\u001b[39m : \u001b[90mstruct\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) \u001b[90m| | | |--\u001b[39m \u001b[90msomething2\u001b[39m : \u001b[90mstruct\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) + |\u001b[90m| | | | |--\u001b[39m \u001b[90mmood2\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m| | | | |--\u001b[39m \u001b[90mmood2\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m| | | | |--\u001b[39m \u001b[90msomething2\u001b[39m : \u001b[90mstring\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) \u001b[90m| | | | |--\u001b[39m \u001b[90msomething2\u001b[39m : \u001b[90mstring\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) + | \u001b[90m|--\u001b[39m \u001b[32mnorma2\u001b[39m : \u001b[32mstring\u001b[39m (nullable = \u001b[32mfalse\u001b[39m) + |""".stripMargin + + assert(e.getMessage == expectedMessage) + } + + "display schema diff as tree with more actual Column 2" in { + val s1 = StructType( + Seq( + StructField("array", ArrayType(StringType, containsNull = true), true), + StructField("map", MapType(StringType, StringType, valueContainsNull = false), true), + StructField("something", StringType, true), + StructField( + "struct", + StructType( + StructType( + Seq( + StructField("mood", ArrayType(StringType, containsNull = false), true), + StructField("something", StringType, false), + StructField( + "something2", + StructType( + Seq( + StructField("mood2", ArrayType(DoubleType, containsNull = false), true), + StructField( + "something2", + StructType( + Seq( + StructField("mood2", ArrayType(DoubleType, containsNull = false), true), + StructField( + "something2", + StructType( + Seq( + StructField("mood2", ArrayType(DoubleType, containsNull = false), true), + StructField("something2", StringType, false) + ) + ), + false + ) + ) + ), + false + ) + ) + ), + false + ) + ) + ) + ), + true + ) + ) + ) + val s2 = StructType( + Seq( + StructField("array", ArrayType(StringType, containsNull = true), true), + StructField("something", StringType, true), + StructField( + "struct", + StructType( + StructType( + Seq( + StructField("something", StringType, false), + StructField("mood", ArrayType(StringType, containsNull = false), true), + StructField( + "something3", + StructType( + Seq( + StructField("mood3", ArrayType(StringType, containsNull = false), true) + ) + ), + false + ) + ) + ) + ), + true + ) + ) + ) + + val e = intercept[DatasetSchemaMismatch] { + SchemaComparer.assertSchemaEqual(s1, s2, ignoreColumnOrder = false, outputFormat = SchemaDiffOutputFormat.Tree) + } + + val expectedMessage = """Diffs + | + |Actual Schema Expected Schema + |\u001b[90m|--\u001b[39m \u001b[90marray\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[90marray\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m|--\u001b[39m \u001b[31mmap\u001b[39m : \u001b[31mmap\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[32msomething\u001b[39m : \u001b[32mstring\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m|--\u001b[39m \u001b[31msomething\u001b[39m : \u001b[31mstring\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m|--\u001b[39m \u001b[32mstruct\u001b[39m : \u001b[32mstruct\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[31m|--\u001b[39m \u001b[31mstruct\u001b[39m : \u001b[31mstruct\u001b[39m (nullable = \u001b[31mtrue\u001b[39m) \u001b[32m| |--\u001b[39m \u001b[32msomething\u001b[39m : \u001b[32mstring\u001b[39m (nullable = \u001b[32mfalse\u001b[39m) + |\u001b[90m| |--\u001b[39m \u001b[90mmood\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) \u001b[90m| |--\u001b[39m \u001b[90mmood\u001b[39m : \u001b[90marray\u001b[39m (nullable = \u001b[90mtrue\u001b[39m) + |\u001b[90m| |--\u001b[39m \u001b[31msomething\u001b[39m : \u001b[31mstring\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) \u001b[90m| |--\u001b[39m \u001b[32msomething3\u001b[39m : \u001b[32mstruct\u001b[39m (nullable = \u001b[90mfalse\u001b[39m) + |\u001b[31m| |--\u001b[39m \u001b[31msomething2\u001b[39m : \u001b[31mstruct\u001b[39m (nullable = \u001b[31mfalse\u001b[39m) \u001b[32m| | |--\u001b[39m \u001b[32mmood3\u001b[39m : \u001b[32marray\u001b[39m (nullable = \u001b[32mtrue\u001b[39m) + |\u001b[31m| | |--\u001b[39m \u001b[31mmood2\u001b[39m : \u001b[31marray\u001b[39m (nullable = \u001b[31mtrue\u001b[39m) + |\u001b[31m| | |--\u001b[39m \u001b[31msomething2\u001b[39m : \u001b[31mstruct\u001b[39m (nullable = \u001b[31mfalse\u001b[39m) + |\u001b[31m| | | |--\u001b[39m \u001b[31mmood2\u001b[39m : \u001b[31marray\u001b[39m (nullable = \u001b[31mtrue\u001b[39m) + |\u001b[31m| | | |--\u001b[39m \u001b[31msomething2\u001b[39m : \u001b[31mstruct\u001b[39m (nullable = \u001b[31mfalse\u001b[39m) + |\u001b[31m| | | | |--\u001b[39m \u001b[31mmood2\u001b[39m : \u001b[31marray\u001b[39m (nullable = \u001b[31mtrue\u001b[39m) + |\u001b[31m| | | | |--\u001b[39m \u001b[31msomething2\u001b[39m : \u001b[31mstring\u001b[39m (nullable = \u001b[31mfalse\u001b[39m) + |""".stripMargin + val actual = e.getMessage + println(actual) + println(expectedMessage) assert(e.getMessage == expectedMessage) } } From b79adc4aa450f9315f1493ffa83f9dca00118d16 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Tue, 19 Nov 2024 22:00:55 +1100 Subject: [PATCH 4/5] put magic number in constant --- .../github/mrpowers/spark/fast/tests/SchemaComparer.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala index 7a003f4..bd3aa09 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala @@ -7,6 +7,8 @@ import org.apache.spark.sql.Dataset import org.apache.spark.sql.types._ object SchemaComparer { + private val INDENT_GAP = 5 + private val DESCRIPTION_GAP = 21 case class DatasetSchemaMismatch(smth: String) extends Exception(smth) private def betterSchemaMismatchMessage(actualSchema: StructType, expectedSchema: StructType): String = { showProductDiff( @@ -20,8 +22,7 @@ object SchemaComparer { private def treeSchemaMismatchMessage[T](actualSchema: StructType, expectedSchema: StructType): String = { def flattenStrucType(s: StructType, indent: Int): (Seq[(Int, StructField)], Int) = s .foldLeft((Seq.empty[(Int, StructField)], Int.MinValue)) { case ((fieldPair, maxWidth), f) => - // 5 char for each level of indentation, 21 char for gap, and description words - val gap = indent * 5 + 21 + f.name.length + f.dataType.typeName.length + f.nullable.toString.length + val gap = indent * INDENT_GAP + DESCRIPTION_GAP + f.name.length + f.dataType.typeName.length + f.nullable.toString.length val pair = fieldPair :+ (indent, f) val newMaxWidth = scala.math.max(maxWidth, gap) f.dataType match { From 94a5f9cef5a4f081f563439f0bfe57026cb4fa99 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Wed, 20 Nov 2024 19:20:43 +1100 Subject: [PATCH 5/5] Add TREE_GAP constant --- .../github/mrpowers/spark/fast/tests/SchemaComparer.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala index bd3aa09..32ceaf7 100644 --- a/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala +++ b/core/src/main/scala/com/github/mrpowers/spark/fast/tests/SchemaComparer.scala @@ -9,6 +9,7 @@ import org.apache.spark.sql.types._ object SchemaComparer { private val INDENT_GAP = 5 private val DESCRIPTION_GAP = 21 + private val TREE_GAP = 6 case class DatasetSchemaMismatch(smth: String) extends Exception(smth) private def betterSchemaMismatchMessage(actualSchema: StructType, expectedSchema: StructType): String = { showProductDiff( @@ -34,7 +35,6 @@ object SchemaComparer { } def depthToIndentStr(depth: Int): String = Range(0, depth).map(_ => "| ").mkString + "|--" - val treeSpaces = 6 val (treeFieldPair1, tree1MaxWidth) = flattenStrucType(actualSchema, 0) val (treeFieldPair2, _) = flattenStrucType(expectedSchema, 0) val (treePair, maxWidth) = treeFieldPair1 @@ -89,8 +89,8 @@ object SchemaComparer { (acc :+ pair, math.max(maxWidth, pair._1.length)) } - val schemaGap = maxWidth + treeSpaces - val headerGap = tree1MaxWidth + treeSpaces + val schemaGap = maxWidth + TREE_GAP + val headerGap = tree1MaxWidth + TREE_GAP treePair .foldLeft(new StringBuilder("\nActual Schema".padTo(headerGap, ' ') + "Expected Schema\n")) { case (sb, (s1, s2)) => val gap = if (s1.isEmpty) headerGap else schemaGap