From 8dd3c8c905f376c73464c9dd1e3374cc7b16c3b4 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 19 Sep 2024 10:59:03 +0000 Subject: [PATCH] improve_execution_time_in_kaggle_loop --- .../components/coder/factor_coder/factor.py | 13 --------- .../factor_execution_template.txt | 4 +-- rdagent/core/experiment.py | 14 ++++++++++ .../data_mining/proposal/model_proposal.py | 28 ++++++++++++------- .../meta_tpl/cross_validation_tpl.py | 4 +-- .../meta_tpl/fea_share_preprocess.py | 10 +++++++ .../experiment/meta_tpl/model/model_rf.py | 2 +- .../experiment/meta_tpl/model/model_xgb.py | 18 +++++++----- .../scenarios/kaggle/experiment/prompts.yaml | 9 +++--- .../scenarios/kaggle/experiment/scenario.py | 24 ++++++++++++---- .../scenarios/kaggle/experiment/workspace.py | 6 ++++ rdagent/scenarios/kaggle/proposal/proposal.py | 28 ++++++++++++------- .../qlib/proposal/factor_proposal.py | 28 ++++++++++++------- .../scenarios/qlib/proposal/model_proposal.py | 26 +++++++++++------ 14 files changed, 140 insertions(+), 74 deletions(-) diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py index f794e4c1..7c1a0828 100644 --- a/rdagent/components/coder/factor_coder/factor.py +++ b/rdagent/components/coder/factor_coder/factor.py @@ -87,19 +87,6 @@ def __init__( self.executed_factor_value_dataframe = executed_factor_value_dataframe self.raise_exception = raise_exception - @staticmethod - def link_data_to_workspace(data_path: Path, workspace_path: Path): - data_path = Path(data_path) - workspace_path = Path(workspace_path) - for data_file_path in data_path.iterdir(): - workspace_data_file_path = workspace_path / data_file_path.name - if workspace_data_file_path.exists(): - workspace_data_file_path.unlink() - subprocess.run( - ["ln", "-s", data_file_path, workspace_data_file_path], - check=False, - ) - def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]: """ execute the implementation and get the factor value by the following steps: diff --git a/rdagent/components/coder/factor_coder/factor_execution_template.txt b/rdagent/components/coder/factor_coder/factor_execution_template.txt index a25d5886..60c07bf6 100644 --- a/rdagent/components/coder/factor_coder/factor_execution_template.txt +++ b/rdagent/components/coder/factor_coder/factor_execution_template.txt @@ -4,8 +4,8 @@ import numpy as np import pandas as pd from factor import feature_engineering_cls -if os.path.exists("valid.pkl"): - valid_df = pd.read_pickle("valid.pkl") +if os.path.exists("X_valid.pkl"): + valid_df = pd.read_pickle("X_valid.pkl").head(1000) else: raise FileNotFoundError("No valid data found.") diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py index f6619876..1602d24f 100644 --- a/rdagent/core/experiment.py +++ b/rdagent/core/experiment.py @@ -1,6 +1,7 @@ from __future__ import annotations import shutil +import subprocess import uuid from abc import ABC, abstractmethod from copy import deepcopy @@ -111,6 +112,19 @@ def prepare(self) -> None: """ self.workspace_path.mkdir(parents=True, exist_ok=True) + @staticmethod + def link_data_to_workspace(data_path: Path, workspace_path: Path): + data_path = Path(data_path) + workspace_path = Path(workspace_path) + for data_file_path in data_path.iterdir(): + workspace_data_file_path = workspace_path / data_file_path.name + if workspace_data_file_path.exists(): + workspace_data_file_path.unlink() + subprocess.run( + ["ln", "-s", data_file_path, workspace_data_file_path], + check=False, 
+ ) + def inject_code(self, **files: str) -> None: """ Inject the code into the folder. diff --git a/rdagent/scenarios/data_mining/proposal/model_proposal.py b/rdagent/scenarios/data_mining/proposal/model_proposal.py index f1bc0cbc..a4efc9b7 100644 --- a/rdagent/scenarios/data_mining/proposal/model_proposal.py +++ b/rdagent/scenarios/data_mining/proposal/model_proposal.py @@ -35,14 +35,18 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]: super().__init__(scen) def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: - hypothesis_feedback = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["hypothesis_and_feedback"]) - .render(trace=trace) + hypothesis_and_feedback = ( + ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["hypothesis_and_feedback"]) + .render(trace=trace) + ) + if len(trace.hist) > 0 + else "No previous hypothesis and feedback available since it's the first round." ) context_dict = { - "hypothesis_and_feedback": hypothesis_feedback, - "RAG": "", + "hypothesis_and_feedback": hypothesis_and_feedback, + "RAG": None, "hypothesis_output_format": prompt_dict["hypothesis_output_format"], "hypothesis_specification": prompt_dict["model_hypothesis_specification"], } @@ -67,9 +71,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b experiment_output_format = prompt_dict["model_experiment_output_format"] hypothesis_and_feedback = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["hypothesis_and_feedback"]) - .render(trace=trace) + ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["hypothesis_and_feedback"]) + .render(trace=trace) + ) + if len(trace.hist) > 0 + else "No previous hypothesis and feedback available since it's the first round." ) experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist] @@ -84,7 +92,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b "hypothesis_and_feedback": hypothesis_and_feedback, "experiment_output_format": experiment_output_format, "target_list": model_list, - "RAG": ..., + "RAG": None, }, True def convert_response(self, response: str, trace: Trace) -> ModelExperiment: diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py index 528931d8..90ec0c2a 100644 --- a/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py +++ b/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py @@ -2,10 +2,8 @@ import numpy as np import pandas as pd -import xgboost as xgb -from sklearn.metrics import accuracy_score, matthews_corrcoef from sklearn.model_selection import KFold -from sklearn.preprocessing import LabelEncoder, OneHotEncoder +from sklearn.preprocessing import LabelEncoder from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py index 787e1698..04233b8f 100644 --- a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py +++ b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py @@ -1,3 +1,4 @@ +import os import pandas as pd from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer @@ -82,6 +83,15 @@ def preprocess_script(): """ This method applies the preprocessing steps to the training, validation, and test datasets. 
""" + if os.path.exists("X_train.pkl"): + X_train = pd.read_pickle("X_train.pkl") + X_valid = pd.read_pickle("X_valid.pkl") + y_train = pd.read_pickle("y_train.pkl") + y_valid = pd.read_pickle("y_valid.pkl") + X_test = pd.read_pickle("X_test.pkl") + passenger_ids = pd.read_pickle("passenger_ids.pkl") + + return X_train, X_valid, y_train, y_valid, X_test, passenger_ids X_train, X_valid, y_train, y_valid = prepreprocess() # Fit the preprocessor on the training data diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_rf.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_rf.py index 3ec75868..3c64a094 100644 --- a/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_rf.py +++ b/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_rf.py @@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali Define and train the Random Forest model. Merge feature selection into the pipeline. """ # Initialize the Random Forest model - model = RandomForestClassifier(n_estimators=100, random_state=32) + model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1) # Select features (if any feature selection is needed) X_train_selected = select(X_train) diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_xgb.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_xgb.py index 74fb6399..35bc7317 100644 --- a/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_xgb.py +++ b/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_xgb.py @@ -6,21 +6,24 @@ import xgboost as xgb -def select(X): - """ - Select relevant features. To be used in fit & predict function - """ + +def select(X: pd.DataFrame) -> pd.DataFrame: + # Ignore feature selection logic return X def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): """Define and train the model. Merge feature_select""" + X_train = select(X_train) + X_valid = select(X_valid) dtrain = xgb.DMatrix(X_train, label=y_train) dvalid = xgb.DMatrix(X_valid, label=y_valid) # TODO: for quick running.... - params = {} - num_round = 50 + params = { + "nthred": -1, + } + num_round = 200 evallist = [(dtrain, "train"), (dvalid, "eval")] bst = xgb.train(params, dtrain, num_round, evallist) @@ -32,6 +35,7 @@ def predict(model, X): """ Keep feature select's consistency. """ + X = select(X) dtest = xgb.DMatrix(X) y_pred_prob = model.predict(dtest) - return y_pred_prob + return y_pred_prob \ No newline at end of file diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml index dcbc54ea..4dbdb6dd 100644 --- a/rdagent/scenarios/kaggle/experiment/prompts.yaml +++ b/rdagent/scenarios/kaggle/experiment/prompts.yaml @@ -10,6 +10,7 @@ kg_description_template: "Target Description": "A description of the target variable to be predicted", "Competition Features": "A dict of relevant features used in the competition and their descriptions (if available)", # if you are not sure about the meaning of the feature, please add a (guess) before the description. Importantly, your feature name should be exactly the same as the feature name in the dataset! } + Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together. user: |- @@ -144,7 +145,7 @@ kg_model_interface: |- from xgboost import DMatrix - def select(self, X: pd.DataFrame) -> pd.DataFrame: ... 
# Implement feature selection logic + def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic def fit( @@ -178,7 +179,7 @@ kg_model_interface: |- from sklearn.metrics import accuracy_score - def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic + def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic def fit( @@ -207,7 +208,7 @@ kg_model_interface: |- from lightgbm import LGBMClassifier, LGBMRegressor - def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic + def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic def fit( @@ -247,7 +248,7 @@ kg_model_interface: |- return x - def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic + def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame) -> torch.nn.Module: diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index 59316cdb..e64817e0 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -1,5 +1,7 @@ +import io import json from pathlib import Path +import pickle import pandas as pd from jinja2 import Environment, StrictUndefined @@ -93,9 +95,12 @@ def background(self) -> str: def source_data(self) -> str: data_folder = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / self.competition - if (data_folder / "valid.pkl").exists(): - X_valid = pd.read_pickle(data_folder / "valid.pkl") - return X_valid.head() + if (data_folder / "X_valid.pkl").exists(): + X_valid = pd.read_pickle(data_folder / "X_valid.pkl") + buffer = io.StringIO() + X_valid.info(verbose=True, buf=buffer, show_counts=True) + data_info = buffer.getvalue() + return data_info preprocess_experiment = KGFactorExperiment([]) ( @@ -108,8 +113,17 @@ def source_data(self) -> str: ) = preprocess_experiment.experiment_workspace.generate_preprocess_data() data_folder.mkdir(exist_ok=True, parents=True) - X_valid.to_pickle(data_folder / "valid.pkl") - return X_valid.head() + pickle.dump(X_train, open(data_folder / "X_train.pkl", "wb")) + pickle.dump(X_valid, open(data_folder / "X_valid.pkl", "wb")) + pickle.dump(y_train, open(data_folder / "y_train.pkl", "wb")) + pickle.dump(y_valid, open(data_folder / "y_valid.pkl", "wb")) + pickle.dump(X_test, open(data_folder / "X_test.pkl", "wb")) + pickle.dump(passenger_ids, open(data_folder / "passenger_ids.pkl", "wb")) + + buffer = io.StringIO() + X_valid.info(verbose=True, buf=buffer, show_counts=True) + data_info = buffer.getvalue() + return data_info @property def output_format(self) -> str: diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py index 42388785..884a5e73 100644 --- a/rdagent/scenarios/kaggle/experiment/workspace.py +++ b/rdagent/scenarios/kaggle/experiment/workspace.py @@ -5,6 +5,7 @@ import pandas as pd from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS from rdagent.core.experiment import FBWorkspace from rdagent.log import rdagent_logger as logger from rdagent.utils.env import KGDockerEnv @@ -58,6 +59,11 @@ def generate_preprocess_data( def execute(self, run_env: dict = {}, *args, **kwargs) -> str: logger.info(f"Running the experiment in 
{self.workspace_path}") + + # link the data to the workspace to speed up the preprocessing + source_data_path = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition + self.link_data_to_workspace(source_data_path, self.workspace_path) + kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition) kgde.prepare() diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index 0851e8d1..ab0359a7 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -82,18 +82,22 @@ def __init__(self, scen: Scenario, knowledge: VectorBase = None) -> Tuple[dict, self.scen.vector_base.save(KAGGLE_IMPLEMENT_SETTING.rag_path) def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: - hypothesis_feedback = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["hypothesis_and_feedback"]) - .render(trace=trace) + hypothesis_and_feedback = ( + ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["hypothesis_and_feedback"]) + .render(trace=trace) + ) + if len(trace.hist) > 0 + else "No previous hypothesis and feedback available since it's the first round." ) - rag_results, _ = self.scen.vector_base.search_experience(hypothesis_feedback, topk_k=5) + rag_results, _ = self.scen.vector_base.search_experience(hypothesis_and_feedback, topk_k=5) rag_content = "\n".join([doc.content for doc in rag_results]) context_dict = { - "hypothesis_and_feedback": hypothesis_feedback, - "RAG": rag_content, + "hypothesis_and_feedback": hypothesis_and_feedback, + "RAG": None, "hypothesis_output_format": prompt_dict["hypothesis_output_format"], "hypothesis_specification": None, } @@ -125,9 +129,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b self.current_action = hypothesis.action hypothesis_and_feedback = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["hypothesis_and_feedback"]) - .render(trace=trace) + ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["hypothesis_and_feedback"]) + .render(trace=trace) + ) + if len(trace.hist) > 0 + else "No previous hypothesis and feedback available since it's the first round." ) experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist] diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py index ef58fc98..3856a179 100644 --- a/rdagent/scenarios/qlib/proposal/factor_proposal.py +++ b/rdagent/scenarios/qlib/proposal/factor_proposal.py @@ -24,14 +24,18 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]: super().__init__(scen) def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: - hypothesis_feedback = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["hypothesis_and_feedback"]) - .render(trace=trace) + hypothesis_and_feedback = ( + ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["hypothesis_and_feedback"]) + .render(trace=trace) + ) + if len(trace.hist) > 0 + else "No previous hypothesis and feedback available since it's the first round." 
) context_dict = { - "hypothesis_and_feedback": hypothesis_feedback, - "RAG": ..., + "hypothesis_and_feedback": hypothesis_and_feedback, + "RAG": None, "hypothesis_output_format": prompt_dict["hypothesis_output_format"], "hypothesis_specification": prompt_dict["factor_hypothesis_specification"], } @@ -56,9 +60,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict | experiment_output_format = prompt_dict["factor_experiment_output_format"] hypothesis_and_feedback = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["hypothesis_and_feedback"]) - .render(trace=trace) + ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["hypothesis_and_feedback"]) + .render(trace=trace) + ) + if len(trace.hist) > 0 + else "No previous hypothesis and feedback available since it's the first round." ) experiment_list: List[FactorExperiment] = [t[1] for t in trace.hist] @@ -73,7 +81,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict | "hypothesis_and_feedback": hypothesis_and_feedback, "experiment_output_format": experiment_output_format, "target_list": factor_list, - "RAG": ..., + "RAG": None, }, True def convert_response(self, response: str, trace: Trace) -> FactorExperiment: diff --git a/rdagent/scenarios/qlib/proposal/model_proposal.py b/rdagent/scenarios/qlib/proposal/model_proposal.py index 274c3273..614b25ba 100644 --- a/rdagent/scenarios/qlib/proposal/model_proposal.py +++ b/rdagent/scenarios/qlib/proposal/model_proposal.py @@ -24,13 +24,17 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]: super().__init__(scen) def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: - hypothesis_feedback = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["hypothesis_and_feedback"]) - .render(trace=trace) + hypothesis_and_feedback = ( + ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["hypothesis_and_feedback"]) + .render(trace=trace) + ) + if len(trace.hist) > 0 + else "No previous hypothesis and feedback available since it's the first round." ) context_dict = { - "hypothesis_and_feedback": hypothesis_feedback, + "hypothesis_and_feedback": hypothesis_and_feedback, "RAG": "In Quantitative Finance, market data could be time-series, and GRU model/LSTM model are suitable for them. Do not generate GNN model as for now.", "hypothesis_output_format": prompt_dict["hypothesis_output_format"], "hypothesis_specification": prompt_dict["model_hypothesis_specification"], @@ -56,9 +60,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b experiment_output_format = prompt_dict["model_experiment_output_format"] hypothesis_and_feedback = ( - Environment(undefined=StrictUndefined) - .from_string(prompt_dict["hypothesis_and_feedback"]) - .render(trace=trace) + ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["hypothesis_and_feedback"]) + .render(trace=trace) + ) + if len(trace.hist) > 0 + else "No previous hypothesis and feedback available since it's the first round." ) experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist] @@ -73,7 +81,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b "hypothesis_and_feedback": hypothesis_and_feedback, "experiment_output_format": experiment_output_format, "target_list": model_list, - "RAG": ..., + "RAG": None, }, True def convert_response(self, response: str, trace: Trace) -> ModelExperiment:
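
Reviewer notes (not part of the diff):

- The execution-time win comes from two cooperating changes: preprocess_script() in fea_share_preprocess.py now short-circuits when cached pickles (X_train.pkl, X_valid.pkl, y_train.pkl, y_valid.pkl, X_test.pkl, passenger_ids.pkl) are present, and KGFBWorkspace.execute() symlinks the shared data folder into the workspace via link_data_to_workspace, which this patch moves from factor.py onto the workspace base class in core/experiment.py. Together they let repeated loop iterations reuse a single preprocessing pass instead of re-running prepreprocess() every time.
- In model_xgb.py the params key "nthred" looks like a typo for XGBoost's nthread parameter; as spelled it is presumably ignored, so the intended parallelism (mirroring n_jobs=-1 in model_rf.py) may not take effect.
- Below is a minimal sketch of the cache-or-compute pattern, assuming the pickle names above; load_or_build is an illustrative helper, not code from the patch (the patch itself keys the fast path on X_train.pkl alone):

    import os
    import pickle

    import pandas as pd

    # Pickle names used by the patch's preprocessing cache.
    CACHE_FILES = [
        "X_train.pkl", "X_valid.pkl", "y_train.pkl",
        "y_valid.pkl", "X_test.pkl", "passenger_ids.pkl",
    ]

    def load_or_build(build_fn, cache_dir="."):
        """Return cached splits if every pickle exists, else build and persist them."""
        paths = [os.path.join(cache_dir, name) for name in CACHE_FILES]
        if all(os.path.exists(p) for p in paths):
            # Fast path: reuse the pickles symlinked into the workspace.
            return tuple(pd.read_pickle(p) for p in paths)
        # Slow path: run the full preprocessing once, then persist each split.
        objs = build_fn()
        for obj, path in zip(objs, paths):
            with open(path, "wb") as f:
                pickle.dump(obj, f)
        return objs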