improve_execution_time_in_kaggle_loop

microsoft · Sep 19, 2024 · 8dd3c8c · 8dd3c8c
1 parent 3535889
commit 8dd3c8c
Show file tree

Hide file tree

Showing 14 changed files with 140 additions and 74 deletions.
diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py
@@ -87,19 +87,6 @@ def __init__(
         self.executed_factor_value_dataframe = executed_factor_value_dataframe
         self.raise_exception = raise_exception
 
-    @staticmethod
-    def link_data_to_workspace(data_path: Path, workspace_path: Path):
-        data_path = Path(data_path)
-        workspace_path = Path(workspace_path)
-        for data_file_path in data_path.iterdir():
-            workspace_data_file_path = workspace_path / data_file_path.name
-            if workspace_data_file_path.exists():
-                workspace_data_file_path.unlink()
-            subprocess.run(
-                ["ln", "-s", data_file_path, workspace_data_file_path],
-                check=False,
-            )
-
     def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]:
         """
         execute the implementation and get the factor value by the following steps:

diff --git a/rdagent/components/coder/factor_coder/factor_execution_template.txt b/rdagent/components/coder/factor_coder/factor_execution_template.txt
@@ -4,8 +4,8 @@ import numpy as np
 import pandas as pd
 from factor import feature_engineering_cls
 
-if os.path.exists("valid.pkl"):
-    valid_df = pd.read_pickle("valid.pkl")
+if os.path.exists("X_valid.pkl"):
+    valid_df = pd.read_pickle("X_valid.pkl").head(1000)
 else:
     raise FileNotFoundError("No valid data found.")
 

diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import shutil
+import subprocess
 import uuid
 from abc import ABC, abstractmethod
 from copy import deepcopy
@@ -111,6 +112,19 @@ def prepare(self) -> None:
         """
         self.workspace_path.mkdir(parents=True, exist_ok=True)
 
+    @staticmethod
+    def link_data_to_workspace(data_path: Path, workspace_path: Path):
+        data_path = Path(data_path)
+        workspace_path = Path(workspace_path)
+        for data_file_path in data_path.iterdir():
+            workspace_data_file_path = workspace_path / data_file_path.name
+            if workspace_data_file_path.exists():
+                workspace_data_file_path.unlink()
+            subprocess.run(
+                ["ln", "-s", data_file_path, workspace_data_file_path],
+                check=False,
+            )
+
     def inject_code(self, **files: str) -> None:
         """
         Inject the code into the folder.

diff --git a/rdagent/scenarios/data_mining/proposal/model_proposal.py b/rdagent/scenarios/data_mining/proposal/model_proposal.py
@@ -35,14 +35,18 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
         super().__init__(scen)
 
     def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
-        hypothesis_feedback = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["hypothesis_and_feedback"])
-            .render(trace=trace)
+        hypothesis_and_feedback = (
+            (
+                Environment(undefined=StrictUndefined)
+                .from_string(prompt_dict["hypothesis_and_feedback"])
+                .render(trace=trace)
+            )
+            if len(trace.hist) > 0
+            else "No previous hypothesis and feedback available since it's the first round."
         )
         context_dict = {
-            "hypothesis_and_feedback": hypothesis_feedback,
-            "RAG": "",
+            "hypothesis_and_feedback": hypothesis_and_feedback,
+            "RAG": None,
             "hypothesis_output_format": prompt_dict["hypothesis_output_format"],
             "hypothesis_specification": prompt_dict["model_hypothesis_specification"],
         }
@@ -67,9 +71,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
         experiment_output_format = prompt_dict["model_experiment_output_format"]
 
         hypothesis_and_feedback = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["hypothesis_and_feedback"])
-            .render(trace=trace)
+            (
+                Environment(undefined=StrictUndefined)
+                .from_string(prompt_dict["hypothesis_and_feedback"])
+                .render(trace=trace)
+            )
+            if len(trace.hist) > 0
+            else "No previous hypothesis and feedback available since it's the first round."
         )
 
         experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist]
@@ -84,7 +92,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
             "hypothesis_and_feedback": hypothesis_and_feedback,
             "experiment_output_format": experiment_output_format,
             "target_list": model_list,
-            "RAG": ...,
+            "RAG": None,
         }, True
 
     def convert_response(self, response: str, trace: Trace) -> ModelExperiment:

diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py
@@ -2,10 +2,8 @@
 
 import numpy as np
 import pandas as pd
-import xgboost as xgb
-from sklearn.metrics import accuracy_score, matthews_corrcoef
 from sklearn.model_selection import KFold
-from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.preprocessing import LabelEncoder
 
 from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess
 

diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
@@ -1,3 +1,4 @@
+import os
 import pandas as pd
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
@@ -82,6 +83,15 @@ def preprocess_script():
     """
     This method applies the preprocessing steps to the training, validation, and test datasets.
     """
+    if os.path.exists("X_train.pkl"):
+        X_train = pd.read_pickle("X_train.pkl")
+        X_valid = pd.read_pickle("X_valid.pkl")
+        y_train = pd.read_pickle("y_train.pkl")
+        y_valid = pd.read_pickle("y_valid.pkl")
+        X_test = pd.read_pickle("X_test.pkl")
+        passenger_ids = pd.read_pickle("passenger_ids.pkl")
+
+        return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
     X_train, X_valid, y_train, y_valid = prepreprocess()
 
     # Fit the preprocessor on the training data

diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_rf.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_rf.py
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
     Define and train the Random Forest model. Merge feature selection into the pipeline.
     """
     # Initialize the Random Forest model
-    model = RandomForestClassifier(n_estimators=100, random_state=32)
+    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
 
     # Select features (if any feature selection is needed)
     X_train_selected = select(X_train)

diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_xgb.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_xgb.py
@@ -6,21 +6,24 @@
 import xgboost as xgb
 
 
-def select(X):
-    """
-    Select relevant features. To be used in fit & predict function
-    """
+
+def select(X: pd.DataFrame) -> pd.DataFrame: 
+    # Ignore feature selection logic
     return X
 
 
 def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
     """Define and train the model. Merge feature_select"""
+    X_train = select(X_train)
+    X_valid = select(X_valid)
     dtrain = xgb.DMatrix(X_train, label=y_train)
     dvalid = xgb.DMatrix(X_valid, label=y_valid)
 
     # TODO: for quick running....
-    params = {}
-    num_round = 50
+    params = {
+        "nthred": -1,
+    }
+    num_round = 200
 
     evallist = [(dtrain, "train"), (dvalid, "eval")]
     bst = xgb.train(params, dtrain, num_round, evallist)
@@ -32,6 +35,7 @@ def predict(model, X):
     """
     Keep feature select's consistency.
     """
+    X = select(X)
     dtest = xgb.DMatrix(X)
     y_pred_prob = model.predict(dtest)
-    return y_pred_prob
+    return y_pred_prob
diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -10,6 +10,7 @@ kg_description_template:
       "Target Description": "A description of the target variable to be predicted",
       "Competition Features": "A dict of relevant features used in the competition and their descriptions (if available)", # if you are not sure about the meaning of the feature, please add a (guess) before the description. Importantly, your feature name should be exactly the same as the feature name in the dataset!
     }
+    Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.
 
 
   user: |-
@@ -144,7 +145,7 @@ kg_model_interface: |-
   from xgboost import DMatrix
 
 
-  def select(self, X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
+  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
 
 
   def fit(
@@ -178,7 +179,7 @@ kg_model_interface: |-
   from sklearn.metrics import accuracy_score
 
 
-  def select(self, X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
+  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
 
 
   def fit(
@@ -207,7 +208,7 @@ kg_model_interface: |-
   from lightgbm import LGBMClassifier, LGBMRegressor
 
 
-  def select(self, X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
+  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
 
 
   def fit(
@@ -247,7 +248,7 @@ kg_model_interface: |-
           return x
 
 
-  def select(self, X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
+  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
 
 
   def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame) -> torch.nn.Module:

diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -1,5 +1,7 @@
+import io
 import json
 from pathlib import Path
+import pickle
 
 import pandas as pd
 from jinja2 import Environment, StrictUndefined
@@ -93,9 +95,12 @@ def background(self) -> str:
     def source_data(self) -> str:
         data_folder = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / self.competition
 
-        if (data_folder / "valid.pkl").exists():
-            X_valid = pd.read_pickle(data_folder / "valid.pkl")
-            return X_valid.head()
+        if (data_folder / "X_valid.pkl").exists():
+            X_valid = pd.read_pickle(data_folder / "X_valid.pkl")
+            buffer = io.StringIO()
+            X_valid.info(verbose=True, buf=buffer, show_counts=True)
+            data_info = buffer.getvalue()
+            return data_info
 
         preprocess_experiment = KGFactorExperiment([])
         (
@@ -108,8 +113,17 @@ def source_data(self) -> str:
         ) = preprocess_experiment.experiment_workspace.generate_preprocess_data()
 
         data_folder.mkdir(exist_ok=True, parents=True)
-        X_valid.to_pickle(data_folder / "valid.pkl")
-        return X_valid.head()
+        pickle.dump(X_train, open(data_folder / "X_train.pkl", "wb"))
+        pickle.dump(X_valid, open(data_folder / "X_valid.pkl", "wb"))
+        pickle.dump(y_train, open(data_folder / "y_train.pkl", "wb"))
+        pickle.dump(y_valid, open(data_folder / "y_valid.pkl", "wb"))
+        pickle.dump(X_test, open(data_folder / "X_test.pkl", "wb"))
+        pickle.dump(passenger_ids, open(data_folder / "passenger_ids.pkl", "wb"))
+
+        buffer = io.StringIO()
+        X_valid.info(verbose=True, buf=buffer, show_counts=True)
+        data_info = buffer.getvalue()
+        return data_info
 
     @property
     def output_format(self) -> str:

diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -5,6 +5,7 @@
 import pandas as pd
 
 from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
+from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
 from rdagent.core.experiment import FBWorkspace
 from rdagent.log import rdagent_logger as logger
 from rdagent.utils.env import KGDockerEnv
@@ -58,6 +59,11 @@ def generate_preprocess_data(
 
     def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
         logger.info(f"Running the experiment in {self.workspace_path}")
+
+        # link the data to the workspace to speed up the preprocessing
+        source_data_path = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition
+        self.link_data_to_workspace(source_data_path, self.workspace_path)
+
         kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
         kgde.prepare()
 

diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py
@@ -82,18 +82,22 @@ def __init__(self, scen: Scenario, knowledge: VectorBase = None) -> Tuple[dict,
         self.scen.vector_base.save(KAGGLE_IMPLEMENT_SETTING.rag_path)
 
     def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
-        hypothesis_feedback = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["hypothesis_and_feedback"])
-            .render(trace=trace)
+        hypothesis_and_feedback = (
+            (
+                Environment(undefined=StrictUndefined)
+                .from_string(prompt_dict["hypothesis_and_feedback"])
+                .render(trace=trace)
+            )
+            if len(trace.hist) > 0
+            else "No previous hypothesis and feedback available since it's the first round."
         )
 
-        rag_results, _ = self.scen.vector_base.search_experience(hypothesis_feedback, topk_k=5)
+        rag_results, _ = self.scen.vector_base.search_experience(hypothesis_and_feedback, topk_k=5)
         rag_content = "\n".join([doc.content for doc in rag_results])
 
         context_dict = {
-            "hypothesis_and_feedback": hypothesis_feedback,
-            "RAG": rag_content,
+            "hypothesis_and_feedback": hypothesis_and_feedback,
+            "RAG": None,
             "hypothesis_output_format": prompt_dict["hypothesis_output_format"],
             "hypothesis_specification": None,
         }
@@ -125,9 +129,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
         self.current_action = hypothesis.action
 
         hypothesis_and_feedback = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["hypothesis_and_feedback"])
-            .render(trace=trace)
+            (
+                Environment(undefined=StrictUndefined)
+                .from_string(prompt_dict["hypothesis_and_feedback"])
+                .render(trace=trace)
+            )
+            if len(trace.hist) > 0
+            else "No previous hypothesis and feedback available since it's the first round."
         )
 
         experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist]

diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py
@@ -24,14 +24,18 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
         super().__init__(scen)
 
     def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
-        hypothesis_feedback = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["hypothesis_and_feedback"])
-            .render(trace=trace)
+        hypothesis_and_feedback = (
+            (
+                Environment(undefined=StrictUndefined)
+                .from_string(prompt_dict["hypothesis_and_feedback"])
+                .render(trace=trace)
+            )
+            if len(trace.hist) > 0
+            else "No previous hypothesis and feedback available since it's the first round."
         )
         context_dict = {
-            "hypothesis_and_feedback": hypothesis_feedback,
-            "RAG": ...,
+            "hypothesis_and_feedback": hypothesis_and_feedback,
+            "RAG": None,
             "hypothesis_output_format": prompt_dict["hypothesis_output_format"],
             "hypothesis_specification": prompt_dict["factor_hypothesis_specification"],
         }
@@ -56,9 +60,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict |
         experiment_output_format = prompt_dict["factor_experiment_output_format"]
 
         hypothesis_and_feedback = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["hypothesis_and_feedback"])
-            .render(trace=trace)
+            (
+                Environment(undefined=StrictUndefined)
+                .from_string(prompt_dict["hypothesis_and_feedback"])
+                .render(trace=trace)
+            )
+            if len(trace.hist) > 0
+            else "No previous hypothesis and feedback available since it's the first round."
         )
 
         experiment_list: List[FactorExperiment] = [t[1] for t in trace.hist]
@@ -73,7 +81,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict |
             "hypothesis_and_feedback": hypothesis_and_feedback,
             "experiment_output_format": experiment_output_format,
             "target_list": factor_list,
-            "RAG": ...,
+            "RAG": None,
         }, True
 
     def convert_response(self, response: str, trace: Trace) -> FactorExperiment: