From dfa3bdbe6c3f56ab3e1c489ca7cd5d67627fa75b Mon Sep 17 00:00:00 2001
From: Noah Botimer
Date: Wed, 29 May 2024 13:46:19 -0400
Subject: [PATCH] Set up Spark container for DB sync

- Move migrate.sc to the sync directory
- Add Dockerfile to build Spark container with drivers

The idea here is to build an image that has the drivers and script already
packaged, ready for immediate use in a workflow.
---
 db/README.md                    |  2 +-
 db/sync/Dockerfile              | 10 ++++++++++
 db/sync/spark-defaults.conf     |  2 ++
 db/{migrate.sc => sync/sync.sc} | 39 +++++++++++++++++++++++++++++----------
 4 files changed, 42 insertions(+), 11 deletions(-)
 create mode 100644 db/sync/Dockerfile
 create mode 100644 db/sync/spark-defaults.conf
 rename db/{migrate.sc => sync/sync.sc} (55%)

diff --git a/db/README.md b/db/README.md
index 5faba0c1..be2407be 100644
--- a/db/README.md
+++ b/db/README.md
@@ -7,7 +7,7 @@ There are raw SQL files here to set up the legacy schema in MariaDB/MySQL:
 - [keys.sql](keys.sql) - Create foreign keys
 - [test-fixture.sql](test-fixture.sql) - Load test site data
 - [drop-keys.sql](drop-keys.sql) - Drop foreign keys; to allow for truncate/reload
-- [migrate.sc](migrate.sc) - A Spark script to migrate data from Oracle to MariaDB
+- [sync/sync.sc](sync/sync.sc) - A Spark script to migrate data from Oracle to MariaDB
 
 ## Local setup with Docker Compose
diff --git a/db/sync/Dockerfile b/db/sync/Dockerfile
new file mode 100644
index 00000000..ae37c920
--- /dev/null
+++ b/db/sync/Dockerfile
@@ -0,0 +1,10 @@
+FROM spark:3.4.1-scala
+
+ENV PATH="$PATH:/opt/spark/bin"
+
+# Configure Spark and cache the database drivers
+RUN mkdir /opt/spark/conf
+COPY spark-defaults.conf /opt/spark/conf
+RUN echo :quit | spark-shell
+
+COPY sync.sc .
diff --git a/db/sync/spark-defaults.conf b/db/sync/spark-defaults.conf
new file mode 100644
index 00000000..dad99706
--- /dev/null
+++ b/db/sync/spark-defaults.conf
@@ -0,0 +1,2 @@
+spark.jars.ivy /opt/spark/jars
+spark.jars.packages com.oracle.database.jdbc:ojdbc11:21.14.0.0,org.mariadb.jdbc:mariadb-java-client:3.4.0
diff --git a/db/migrate.sc b/db/sync/sync.sc
similarity index 55%
rename from db/migrate.sc
rename to db/sync/sync.sc
index 9235b474..8ca1c552 100644
--- a/db/migrate.sc
+++ b/db/sync/sync.sc
@@ -3,16 +3,35 @@
 // As of Spark 3.2.1, we must use the MySQL Connector/J to connect to MariaDB
 // because of some dialect / datatype issues.
 //
-// This was used with Spark 3.2.1, via spark-shell.
-// Depends on MySQL Connector/J and Oracle JDBC (ojdbc11) driver.
+// As of Spark 3.4.1, the MariaDB connector works, as long as the permitMysqlScheme
+// parameter is used on the connection.
 //
-// The connection strings must be set in the ORACLE_DB_URL and MYSQL_DB_URL environment variables.
+// Spark 3.5.1 has some problem with a plan string buffer size limit, apparently
+// related to an upgrade to the Jackson version. It appears to be configurable.
 //
-// The packages can be downloaded via Maven and this script can be run "immediately":
+// This works with Spark 3.4.1 (-scala image variants), via spark-shell.
+// Depends on the MariaDB connector and the Oracle JDBC (ojdbc11) driver.
+//
+// The connection strings must be set in the ORACLE_URL and MARIADB_URL environment variables.
 //
-// spark-shell --packages com.oracle.database.jdbc:ojdbc11:21.5.0.0,mysql:mysql-connector-java:8.0.29 -I migrate.sc
 //
+// If running in a container built with the companion Dockerfile, the conf/spark-defaults.conf
+// already sets the Ivy directory and specifies the packages, the build will have
+// downloaded them to the cache, and spark-shell is on the PATH. If not:
+//
+// The packages can be downloaded via Maven and this script can be run "immediately":
+//
+// spark-shell --packages com.oracle.database.jdbc:ojdbc11:21.5.0.0,org.mariadb.jdbc:mariadb-java-client:3.3.3 -I sync.sc
+//
+// When running in a container based on the official Spark image, you will need to
+// specify a directory that can be written/created for the Maven/Ivy downloads, e.g.:
+//
+// spark-shell --conf "spark.jars.ivy=/opt/spark/work-dir/ivy" ...
+//
 // This should leave you in the Spark shell, which you can exit with `exit` or Ctrl-d.
+//
+// Another approach is to pipe :quit into spark-shell to terminate automatically:
+//
+// echo :quit | spark-shell -I sync.sc
 import java.util.Properties
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.DataFrameReader
@@ -21,12 +40,12 @@ import org.apache.spark.sql.DataFrameWriter
 // Oracle connection string, e.g.,
 // jdbc:oracle:thin:<user>/<password>@<host>:1521:<SID>
 // jdbc:oracle:thin:somebody/pw@localhost:1521:orcl
-val ourl = scala.util.Properties.envOrElse("ORACLE_DB_URL", "ORACLE_DB_URL not set in env")
+val ourl = scala.util.Properties.envOrElse("ORACLE_URL", "ORACLE_URL not set in env")
 
 // MariaDB/MySQL connection string, e.g.,
-// jdbc:mysql://<host>:3306/<database>?user=<user>&password=<password>
-// jdbc:mysql://localhsot:3306/authz_umichlib?user=somebody&password=pw
-val murl = scala.util.Properties.envOrElse("MYSQL_DB_URL", "MYSQL_DB_URL not set in env")
+// jdbc:mysql://<host>:3306/<database>?user=<user>&password=<password>&permitMysqlScheme
+// jdbc:mysql://localhost:3306/authz_umichlib?user=somebody&password=pw&permitMysqlScheme
+val murl = scala.util.Properties.envOrElse("MARIADB_URL", "MARIADB_URL not set in env")
 
 // Read a table from Oracle.
 // This builds a DataFrameReader, then loads it to give a DataFrame.
@@ -43,7 +62,7 @@ def oread(table:String) : DataFrame = {
 // The config here expects the table to exist and uses TRUNCATE rather than DROP.
 def mwrite(df:DataFrame, table:String) = {
   df.write.format("jdbc")
-    .option("driver", "com.mysql.cj.jdbc.Driver")
+    .option("driver", "org.mariadb.jdbc.Driver")
     .option("url", murl)
     .option("dbtable", table)
     .option("sessionInitStatement", "SET FOREIGN_KEY_CHECKS=0")
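
For context on how the script's helpers pair up in the body of sync.sc (not shown in
full above), a minimal sketch of a one-table sync using the oread and mwrite
definitions from the patch; the table names "GRANTS"/"grants" are illustrative
placeholders, not names from the patch:

    // Hypothetical example: copy one table from Oracle to MariaDB.
    // "GRANTS" / "grants" are placeholder table names for illustration.
    val grants: DataFrame = oread("GRANTS")   // load the Oracle table into a DataFrame
    mwrite(grants, "grants")                  // truncate and reload the MariaDB table

Because mwrite truncates rather than drops, and disables foreign key checks for the
session, the target table must already exist in MariaDB with the desired schema.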
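
With the image built from db/sync, a workflow job should only need the two
connection-string variables and a non-interactive invocation. A minimal sketch,
assuming the image is tagged db-sync and that the base image's entrypoint passes an
arbitrary command through; the tag, hostnames, and credentials are illustrative:

    docker build -t db-sync db/sync
    docker run --rm \
      -e ORACLE_URL='jdbc:oracle:thin:somebody/pw@oracle-host:1521:orcl' \
      -e MARIADB_URL='jdbc:mysql://maria-host:3306/authz_umichlib?user=somebody&password=pw&permitMysqlScheme' \
      db-sync sh -c 'echo :quit | spark-shell -I sync.sc'

The Dockerfile copies sync.sc into the image's working directory, so the -I path
resolves without qualification, and the drivers are already in the Ivy cache from
the build-time spark-shell run.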