
feat: add testmon in pytest #1209
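
For context: pytest-testmon keeps a .testmondata database mapping source files to the tests that exercise them, and uses it to select only the tests affected by a change. A minimal sketch of the workflow this PR wires into CI, using the flags that appear in the diff below (paths illustrative):

    # On staging/master: run the whole suite while recording coverage-to-test data.
    pytest --testmon-noselect test/integration_tests/long/

    # On an open PR, with staging's .testmondata restored from cache:
    # run only the tests testmon maps to the changed files.
    pytest --testmon-forceselect test/integration_tests/long/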

24 changes: 23 additions & 1 deletion .circleci/config.yml
@@ -56,7 +56,18 @@ workflows:
               ignore:
                 - master
                 - staging

+      ################################
+      #### LONG INTEGRATION TESTS: PR
+      ################################
+      - Linux:
+          name: Long Integration Test (Cache) | v3.10 | Linux
+          mode: LONG INTEGRATION CACHE
+          filters:
+            branches:
+              ignore:
+                - master
+                - staging
       ################################
       #### SHORT THIRDPARTY TESTS: PR
       ################################
@@ -201,6 +212,11 @@ jobs:
       - restore_cache:
           keys:
             - v1-model_cache-{{ checksum "setup.py" }}

+      # Always restore testmondata from staging, python3.10, ray disabled.
+      - restore_cache:
Member Author:
Not something critical, but shall we only restore the cache for the open-PR long integration cached test?

Collaborator:
Actually, I tried a different way to simplify the CircleCI config. Please take a look.

Member Author:
Oh, I meant that it is currently restoring the cache for every test. Is that expected? Do we want to rerun everything for staging and master, or should we run only the affected tests there as well?

Member (@gaurav274, Sep 26, 2023):
I feel we should run all the test cases for staging/master.

Collaborator:
1. We are running all the test cases for staging/master: those branches use --testmon-noselect, which runs everything while still collecting data.
2. Yes, we restore the staging cache key for every job, but the restored .testmondata is only used by the open-PR long integration cached test. Restoring unconditionally avoids the when conditions that would otherwise complicate the CircleCI config. PS: I could not get a when condition to work for restoring a cache.

+          keys:
+            - v1-testmon_cache-staging-python3.10-rayDISABLED-{{ checksum "setup.py" }}

       - run:
           name: Install EvaDB package from GitHub repo with all dependencies
@@ -251,6 +267,12 @@ jobs:
             - /home/circleci/.cache/torch/
             - /home/circleci/.cache/gpt4all/

+      # Collect the testmondata only on the staging branch
+      - save_cache:
+          key: v1-testmon_cache-{{ .Branch }}-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}
+          paths:
+            - .testmondata
+
       - save_cache:
           key: v1-pip-wheel_cache-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}
           paths:
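
Net effect of the restore and save steps above: every job saves a .testmondata cache keyed by its own branch, but all jobs restore the staging key, so only data collected on staging drives test selection on PRs. With hypothetical values substituted into the key templates (branch my-feature, python3.10, ray disabled, made-up checksum), the rendered keys would look roughly like:

    # saved by the job, under its own branch name:
    v1-testmon_cache-my-feature-python3.10-rayDISABLED-abc123
    # restored by the job, regardless of branch:
    v1-testmon_cache-staging-python3.10-rayDISABLED-abc123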
8 changes: 8 additions & 0 deletions evadb/utils/generic_utils.py
@@ -336,6 +336,14 @@ def try_to_import_sklearn():
         )


+def is_sklearn_available() -> bool:
+    try:
+        try_to_import_sklearn()
+        return True
+    except ValueError:
+        return False
+
+
 ##############################
 ## VISION
 ##############################
17 changes: 14 additions & 3 deletions script/test/test.sh
@@ -82,7 +82,13 @@ short_third_party_test(){
 }

 long_integration_test() {
-    PYTHONPATH=./ python -m pytest test/integration_tests/long/ -p no:cov -m "not benchmark"
+    cache=$1
+    if [[ "$cache" = "WITH_CACHE" ]];
+    then
+        PYTHONPATH=./ python -m pytest -ra --testmon-forceselect test/integration_tests/long/ -p no:cov -m "not benchmark"
+    else
+        PYTHONPATH=./ python -m pytest -ra --testmon-noselect test/integration_tests/long/ -p no:cov -m "not benchmark"
+    fi
     code=$?
     print_error_code $code "LONG INTEGRATION TEST"
 }
@@ -194,8 +200,13 @@
 ##################################################

 if [[ "$MODE" = "LONG INTEGRATION" ]];
-then
-    long_integration_test
+then
+    long_integration_test "WITHOUT_CACHE"
 fi

+if [[ "$MODE" = "LONG INTEGRATION CACHE" ]];
+then
+    long_integration_test "WITH_CACHE"
+fi
+
 ##################################################
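
The script branches on a $MODE variable, but the diff does not show how MODE is populated; assuming it is taken from the environment, a hypothetical local invocation of the two long-integration paths would be:

    # Full run; --testmon-noselect still records .testmondata.
    MODE="LONG INTEGRATION" bash script/test/test.sh

    # Cached run; --testmon-forceselect executes only the affected tests.
    MODE="LONG INTEGRATION CACHE" bash script/test/test.sh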
6 changes: 5 additions & 1 deletion setup.py
@@ -118,6 +118,8 @@ def read(path, encoding="utf-8"):

 ludwig_libs = ["ludwig[hyperopt,distributed]"]  # MODEL TRAIN AND FINE TUNING

+sklearn_libs = ["scikit-learn"]
+
 forecasting_libs = [
     "statsforecast"  # MODEL TRAIN AND FINE TUNING
 ]
@@ -131,6 +133,7 @@ def read(path, encoding="utf-8"):
     "mock",
     "coveralls>=3.0.1",
     "moto[s3]>=4.1.1",
+    "pytest-testmon",
     # BENCHMARK PACKAGES
     "pytest-benchmark",
     # LINTING PACKAGES
@@ -160,9 +163,10 @@ def read(path, encoding="utf-8"):
     "chromadb": chromadb_libs,
     "postgres": postgres_libs,
     "ludwig": ludwig_libs,
+    "sklearn": sklearn_libs,
     "forecasting": forecasting_libs,
     # everything except ray, qdrant, ludwig and postgres. The first three fail on python 3.11.
-    "dev": dev_libs + vision_libs + document_libs + function_libs + notebook_libs + forecasting_libs,
+    "dev": dev_libs + vision_libs + document_libs + function_libs + notebook_libs + forecasting_libs + sklearn_libs,
 }

 setup(
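
With the new extra in place, scikit-learn can be installed on its own or through the dev bundle; a minimal sketch, assuming the published package name evadb:

    # Just the sklearn integration:
    pip install "evadb[sklearn]"

    # Development checkout; the dev extra now pulls in sklearn_libs too:
    pip install -e ".[dev]"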
17 changes: 12 additions & 5 deletions test/integration_tests/long/test_model_train.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
-from test.markers import ludwig_skip_marker
+from test.markers import ludwig_skip_marker, sklearn_skip_marker
 from test.util import get_evadb_for_testing, shutdown_ray

 import pytest
@@ -53,11 +53,17 @@ def tearDownClass(cls):

         # clean up
         execute_query_fetch_all(cls.evadb, "DROP TABLE IF EXISTS HomeRentals;")
+        execute_query_fetch_all(
+            cls.evadb, "DROP FUNCTION IF EXISTS PredictHouseRentLudwig;"
+        )
+        execute_query_fetch_all(
+            cls.evadb, "DROP FUNCTION IF EXISTS PredictHouseRentSklearn;"
+        )

     @ludwig_skip_marker
     def test_ludwig_automl(self):
         create_predict_function = """
-            CREATE FUNCTION IF NOT EXISTS PredictHouseRent FROM
+            CREATE OR REPLACE FUNCTION PredictHouseRentLudwig FROM
             ( SELECT * FROM HomeRentals )
             TYPE Ludwig
             PREDICT 'rental_price'
@@ -66,23 +72,24 @@ def test_ludwig_automl(self):
         execute_query_fetch_all(self.evadb, create_predict_function)

         predict_query = """
-            SELECT PredictHouseRent(*) FROM HomeRentals LIMIT 10;
+            SELECT PredictHouseRentLudwig(*) FROM HomeRentals LIMIT 10;
         """
         result = execute_query_fetch_all(self.evadb, predict_query)
         self.assertEqual(len(result.columns), 1)
         self.assertEqual(len(result), 10)

+    @sklearn_skip_marker
     def test_sklearn_regression(self):
         create_predict_function = """
-            CREATE FUNCTION IF NOT EXISTS PredictHouseRent FROM
+            CREATE OR REPLACE FUNCTION PredictHouseRentSklearn FROM
             ( SELECT number_of_rooms, number_of_bathrooms, days_on_market, rental_price FROM HomeRentals )
             TYPE Sklearn
             PREDICT 'rental_price';
         """
         execute_query_fetch_all(self.evadb, create_predict_function)

         predict_query = """
-            SELECT PredictHouseRent(number_of_rooms, number_of_bathrooms, days_on_market, rental_price) FROM HomeRentals LIMIT 10;
+            SELECT PredictHouseRentSklearn(number_of_rooms, number_of_bathrooms, days_on_market, rental_price) FROM HomeRentals LIMIT 10;
         """
         result = execute_query_fetch_all(self.evadb, predict_query)
         self.assertEqual(len(result.columns), 1)
5 changes: 5 additions & 0 deletions test/markers.py
@@ -25,6 +25,7 @@
     is_ludwig_available,
     is_pinecone_available,
     is_qdrant_available,
+    is_sklearn_available,
 )

 asyncio_skip_marker = pytest.mark.skipif(
@@ -83,6 +84,10 @@
     is_ludwig_available() is False, reason="Run only if ludwig is available"
 )

+sklearn_skip_marker = pytest.mark.skipif(
+    is_sklearn_available() is False, reason="Run only if sklearn is available"
+)
+
 chatgpt_skip_marker = pytest.mark.skip(
     reason="requires chatgpt",
 )