From dfa3bdbe6c3f56ab3e1c489ca7cd5d67627fa75b Mon Sep 17 00:00:00 2001
From: Noah Botimer
Date: Wed, 29 May 2024 13:46:19 -0400
Subject: [PATCH] Set up Spark container for DB sync

- Move migrate.sc to the sync directory
- Add Dockerfile to build Spark container with drivers

The idea here is to build an image that has the drivers and script already
packaged, ready for immediate use in a workflow.
---
 db/README.md                    |  2 +-
 db/sync/Dockerfile              | 10 ++++++++++
 db/sync/spark-defaults.conf     |  2 ++
 db/{migrate.sc => sync/sync.sc} | 39 +++++++++++++++++++++++++++++----------
 4 files changed, 42 insertions(+), 11 deletions(-)
 create mode 100644 db/sync/Dockerfile
 create mode 100644 db/sync/spark-defaults.conf
 rename db/{migrate.sc => sync/sync.sc} (55%)

diff --git a/db/README.md b/db/README.md
index 5faba0c1..be2407be 100644
--- a/db/README.md
+++ b/db/README.md
@@ -7,7 +7,7 @@ There are raw SQL files here to set up the legacy schema in MariaDB/MySQL:
 - [keys.sql](keys.sql) - Create foreign keys
 - [test-fixture.sql](test-fixture.sql) - Load test site data
 - [drop-keys.sql](drop-keys.sql) - Drop foreign keys; to allow for truncate/reload
-- [migrate.sc](migrate.sc) - A Spark script to migrate data from Oracle to MariaDB
+- [sync/sync.sc](sync/sync.sc) - A Spark script to migrate data from Oracle to MariaDB
 
 ## Local setup with Docker Compose
diff --git a/db/sync/Dockerfile b/db/sync/Dockerfile
new file mode 100644
index 00000000..ae37c920
--- /dev/null
+++ b/db/sync/Dockerfile
@@ -0,0 +1,10 @@
+FROM spark:3.4.1-scala
+
+ENV PATH="$PATH:/opt/spark/bin"
+
+# Configure Spark and cache the database drivers
+RUN mkdir /opt/spark/conf
+COPY spark-defaults.conf /opt/spark/conf
+RUN echo :quit | spark-shell
+
+COPY sync.sc .
diff --git a/db/sync/spark-defaults.conf b/db/sync/spark-defaults.conf
new file mode 100644
index 00000000..dad99706
--- /dev/null
+++ b/db/sync/spark-defaults.conf
@@ -0,0 +1,2 @@
+spark.jars.ivy /opt/spark/jars
+spark.jars.packages com.oracle.database.jdbc:ojdbc11:21.14.0.0,org.mariadb.jdbc:mariadb-java-client:3.4.0
diff --git a/db/migrate.sc b/db/sync/sync.sc
similarity index 55%
rename from db/migrate.sc
rename to db/sync/sync.sc
index 9235b474..8ca1c552 100644
--- a/db/migrate.sc
+++ b/db/sync/sync.sc
@@ -3,16 +3,35 @@
 // As of Spark 3.2.1, we must use the MySQL Connector/J to connect to MariaDB
 // because of some dialect / datatype issues.
 //
-// This was used with Spark 3.2.1, via spark-shell.
-// Depends on MySQL Connector/J and Oracle JDBC (ojdbc11) driver.
+// As of Spark 3.4.1, the MariaDB connector works, as long as the permitMysqlScheme
+// parameter is used on the connection.
 //
-// The connection strings must be set in the ORACLE_DB_URL and MYSQL_DB_URL environment variables.
+// Spark 3.5.1 has some problem with a plan string buffer size limit, apparently
+// related to an upgrade to the Jackson version. It appears to be configurable.
 //
-// The packages can be downloaded via Maven and this script can be run "immediately":
+// This works with Spark 3.4.1 (-scala image variants), via spark-shell.
+// Depends on the MariaDB connector and the Oracle JDBC (ojdbc11) driver.
+//
+// The connection strings must be set in the ORACLE_URL and MARIADB_URL environment variables.
 //
-// spark-shell --packages com.oracle.database.jdbc:ojdbc11:21.5.0.0,mysql:mysql-connector-java:8.0.29 -I migrate.sc
 //
+// If running in a container built with the companion Dockerfile, the conf/spark-defaults.conf
+// already sets the Ivy directory and specifies the packages, the build will have
+// downloaded them to the cache, and spark-shell is on the PATH. If not:
+//
+// The packages can be downloaded via Maven and this script can be run "immediately":
+//
+// spark-shell --packages com.oracle.database.jdbc:ojdbc11:21.5.0.0,org.mariadb.jdbc:mariadb-java-client:3.3.3 -I sync.sc
+//
+// When running in a container based on the official Spark image, you will need to
+// specify a directory that can be written/created for the Maven/Ivy downloads, e.g.:
+//
+// spark-shell --conf "spark.jars.ivy=/opt/spark/work-dir/ivy" ...
+//
 // This should leave you in the Spark shell, which you can exit with `exit` or Ctrl-d.
+//
+// Another approach is to pipe :quit into spark-shell to terminate automatically:
+//
+// echo :quit | spark-shell -I sync.sc
 import java.util.Properties
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.DataFrameReader
@@ -21,12 +40,12 @@ import org.apache.spark.sql.DataFrameWriter
 // Oracle connection string, e.g.,
 // jdbc:oracle:thin:<user>/<password>@<host>:1521:<SID>
 // jdbc:oracle:thin:somebody/pw@localhost:1521:orcl
-val ourl = scala.util.Properties.envOrElse("ORACLE_DB_URL", "ORACLE_DB_URL not set in env")
+val ourl = scala.util.Properties.envOrElse("ORACLE_URL", "ORACLE_URL not set in env")
 
 // MariaDB/MySQL connection string, e.g.,
-// jdbc:mysql://<host>:3306/<database>?user=<user>&password=<password>
-// jdbc:mysql://localhsot:3306/authz_umichlib?user=somebody&password=pw
-val murl = scala.util.Properties.envOrElse("MYSQL_DB_URL", "MYSQL_DB_URL not set in env")
+// jdbc:mysql://<host>:3306/<database>?user=<user>&password=<password>&permitMysqlScheme
+// jdbc:mysql://localhost:3306/authz_umichlib?user=somebody&password=pw&permitMysqlScheme
+val murl = scala.util.Properties.envOrElse("MARIADB_URL", "MARIADB_URL not set in env")
 
 // Read a table from Oracle.
 // This builds a DataFrameReader, then loads it to give a DataFrame.
@@ -43,7 +62,7 @@ def oread(table:String) : DataFrame = {
 // The config here expects the table to exist and uses TRUNCATE rather than DROP.
 def mwrite(df:DataFrame, table:String) = {
   df.write.format("jdbc")
-    .option("driver", "com.mysql.cj.jdbc.Driver")
+    .option("driver", "org.mariadb.jdbc.Driver")
     .option("url", murl)
     .option("dbtable", table)
     .option("sessionInitStatement", "SET FOREIGN_KEY_CHECKS=0")
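
For context on how the script's helpers pair up in the body of sync.sc (not shown in
full above), a minimal sketch of a one-table sync using the oread and mwrite
definitions from the patch; the table names "GRANTS"/"grants" are illustrative
placeholders, not names from the patch:

    // Hypothetical example: copy one table from Oracle to MariaDB.
    // "GRANTS" / "grants" are placeholder table names for illustration.
    val grants: DataFrame = oread("GRANTS")   // load the Oracle table into a DataFrame
    mwrite(grants, "grants")                  // truncate and reload the MariaDB table

Because mwrite truncates rather than drops, and disables foreign key checks for the
session, the target table must already exist in MariaDB with the desired schema.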
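
With the image built from db/sync, a workflow job should only need the two
connection-string variables and a non-interactive invocation. A minimal sketch,
assuming the image is tagged db-sync and that the base image's entrypoint passes an
arbitrary command through; the tag, hostnames, and credentials are illustrative:

    docker build -t db-sync db/sync
    docker run --rm \
      -e ORACLE_URL='jdbc:oracle:thin:somebody/pw@oracle-host:1521:orcl' \
      -e MARIADB_URL='jdbc:mysql://maria-host:3306/authz_umichlib?user=somebody&password=pw&permitMysqlScheme' \
      db-sync sh -c 'echo :quit | spark-shell -I sync.sc'

The Dockerfile copies sync.sc into the image's working directory, so the -I path
resolves without qualification, and the drivers are already in the Ivy cache from
the build-time spark-shell run.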