Skip to content

Commit

Permalink
improve_execution_time_in_kaggle_loop
Browse files Browse the repository at this point in the history
  • Loading branch information
peteryang1 committed Sep 19, 2024
1 parent 3535889 commit 8dd3c8c
Show file tree
Hide file tree
Showing 14 changed files with 140 additions and 74 deletions.
13 changes: 0 additions & 13 deletions rdagent/components/coder/factor_coder/factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,19 +87,6 @@ def __init__(
self.executed_factor_value_dataframe = executed_factor_value_dataframe
self.raise_exception = raise_exception

@staticmethod
def link_data_to_workspace(data_path: Path, workspace_path: Path):
data_path = Path(data_path)
workspace_path = Path(workspace_path)
for data_file_path in data_path.iterdir():
workspace_data_file_path = workspace_path / data_file_path.name
if workspace_data_file_path.exists():
workspace_data_file_path.unlink()
subprocess.run(
["ln", "-s", data_file_path, workspace_data_file_path],
check=False,
)

def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]:
"""
execute the implementation and get the factor value by the following steps:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import numpy as np
import pandas as pd
from factor import feature_engineering_cls

if os.path.exists("valid.pkl"):
valid_df = pd.read_pickle("valid.pkl")
if os.path.exists("X_valid.pkl"):
valid_df = pd.read_pickle("X_valid.pkl").head(1000)
else:
raise FileNotFoundError("No valid data found.")

Expand Down
14 changes: 14 additions & 0 deletions rdagent/core/experiment.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import shutil
import subprocess
import uuid
from abc import ABC, abstractmethod
from copy import deepcopy
Expand Down Expand Up @@ -111,6 +112,19 @@ def prepare(self) -> None:
"""
self.workspace_path.mkdir(parents=True, exist_ok=True)

@staticmethod
def link_data_to_workspace(data_path: Path, workspace_path: Path):
data_path = Path(data_path)
workspace_path = Path(workspace_path)
for data_file_path in data_path.iterdir():
workspace_data_file_path = workspace_path / data_file_path.name
if workspace_data_file_path.exists():
workspace_data_file_path.unlink()
subprocess.run(
["ln", "-s", data_file_path, workspace_data_file_path],
check=False,
)

def inject_code(self, **files: str) -> None:
"""
Inject the code into the folder.
Expand Down
28 changes: 18 additions & 10 deletions rdagent/scenarios/data_mining/proposal/model_proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,18 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
super().__init__(scen)

def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
hypothesis_feedback = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
hypothesis_and_feedback = (
(
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
)
if len(trace.hist) > 0
else "No previous hypothesis and feedback available since it's the first round."
)
context_dict = {
"hypothesis_and_feedback": hypothesis_feedback,
"RAG": "",
"hypothesis_and_feedback": hypothesis_and_feedback,
"RAG": None,
"hypothesis_output_format": prompt_dict["hypothesis_output_format"],
"hypothesis_specification": prompt_dict["model_hypothesis_specification"],
}
Expand All @@ -67,9 +71,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
experiment_output_format = prompt_dict["model_experiment_output_format"]

hypothesis_and_feedback = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
(
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
)
if len(trace.hist) > 0
else "No previous hypothesis and feedback available since it's the first round."
)

experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist]
Expand All @@ -84,7 +92,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
"hypothesis_and_feedback": hypothesis_and_feedback,
"experiment_output_format": experiment_output_format,
"target_list": model_list,
"RAG": ...,
"RAG": None,
}, True

def convert_response(self, response: str, trace: Trace) -> ModelExperiment:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
Expand Down Expand Up @@ -82,6 +83,15 @@ def preprocess_script():
"""
This method applies the preprocessing steps to the training, validation, and test datasets.
"""
if os.path.exists("X_train.pkl"):
X_train = pd.read_pickle("X_train.pkl")
X_valid = pd.read_pickle("X_valid.pkl")
y_train = pd.read_pickle("y_train.pkl")
y_valid = pd.read_pickle("y_valid.pkl")
X_test = pd.read_pickle("X_test.pkl")
passenger_ids = pd.read_pickle("passenger_ids.pkl")

return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
X_train, X_valid, y_train, y_valid = prepreprocess()

# Fit the preprocessor on the training data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
Define and train the Random Forest model. Merge feature selection into the pipeline.
"""
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=32)
model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)

# Select features (if any feature selection is needed)
X_train_selected = select(X_train)
Expand Down
18 changes: 11 additions & 7 deletions rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_xgb.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,24 @@
import xgboost as xgb


def select(X):
"""
Select relevant features. To be used in fit & predict function
"""

def select(X: pd.DataFrame) -> pd.DataFrame:
# Ignore feature selection logic
return X


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
"""Define and train the model. Merge feature_select"""
X_train = select(X_train)
X_valid = select(X_valid)
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

# TODO: for quick running....
params = {}
num_round = 50
params = {
"nthred": -1,
}
num_round = 200

evallist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, evallist)
Expand All @@ -32,6 +35,7 @@ def predict(model, X):
"""
Keep feature select's consistency.
"""
X = select(X)
dtest = xgb.DMatrix(X)
y_pred_prob = model.predict(dtest)
return y_pred_prob
return y_pred_prob
9 changes: 5 additions & 4 deletions rdagent/scenarios/kaggle/experiment/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ kg_description_template:
"Target Description": "A description of the target variable to be predicted",
"Competition Features": "A dict of relevant features used in the competition and their descriptions (if available)", # if you are not sure about the meaning of the feature, please add a (guess) before the description. Importantly, your feature name should be exactly the same as the feature name in the dataset!
}
Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.
user: |-
Expand Down Expand Up @@ -144,7 +145,7 @@ kg_model_interface: |-
from xgboost import DMatrix
def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def fit(
Expand Down Expand Up @@ -178,7 +179,7 @@ kg_model_interface: |-
from sklearn.metrics import accuracy_score
def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def fit(
Expand Down Expand Up @@ -207,7 +208,7 @@ kg_model_interface: |-
from lightgbm import LGBMClassifier, LGBMRegressor
def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def fit(
Expand Down Expand Up @@ -247,7 +248,7 @@ kg_model_interface: |-
return x
def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame) -> torch.nn.Module:
Expand Down
24 changes: 19 additions & 5 deletions rdagent/scenarios/kaggle/experiment/scenario.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import io
import json
from pathlib import Path
import pickle

import pandas as pd
from jinja2 import Environment, StrictUndefined
Expand Down Expand Up @@ -93,9 +95,12 @@ def background(self) -> str:
def source_data(self) -> str:
data_folder = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / self.competition

if (data_folder / "valid.pkl").exists():
X_valid = pd.read_pickle(data_folder / "valid.pkl")
return X_valid.head()
if (data_folder / "X_valid.pkl").exists():
X_valid = pd.read_pickle(data_folder / "X_valid.pkl")
buffer = io.StringIO()
X_valid.info(verbose=True, buf=buffer, show_counts=True)
data_info = buffer.getvalue()
return data_info

preprocess_experiment = KGFactorExperiment([])
(
Expand All @@ -108,8 +113,17 @@ def source_data(self) -> str:
) = preprocess_experiment.experiment_workspace.generate_preprocess_data()

data_folder.mkdir(exist_ok=True, parents=True)
X_valid.to_pickle(data_folder / "valid.pkl")
return X_valid.head()
pickle.dump(X_train, open(data_folder / "X_train.pkl", "wb"))
pickle.dump(X_valid, open(data_folder / "X_valid.pkl", "wb"))
pickle.dump(y_train, open(data_folder / "y_train.pkl", "wb"))
pickle.dump(y_valid, open(data_folder / "y_valid.pkl", "wb"))
pickle.dump(X_test, open(data_folder / "X_test.pkl", "wb"))
pickle.dump(passenger_ids, open(data_folder / "passenger_ids.pkl", "wb"))

buffer = io.StringIO()
X_valid.info(verbose=True, buf=buffer, show_counts=True)
data_info = buffer.getvalue()
return data_info

@property
def output_format(self) -> str:
Expand Down
6 changes: 6 additions & 0 deletions rdagent/scenarios/kaggle/experiment/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
from rdagent.core.experiment import FBWorkspace
from rdagent.log import rdagent_logger as logger
from rdagent.utils.env import KGDockerEnv
Expand Down Expand Up @@ -58,6 +59,11 @@ def generate_preprocess_data(

def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
logger.info(f"Running the experiment in {self.workspace_path}")

# link the data to the workspace to speed up the preprocessing
source_data_path = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition
self.link_data_to_workspace(source_data_path, self.workspace_path)

kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
kgde.prepare()

Expand Down
28 changes: 18 additions & 10 deletions rdagent/scenarios/kaggle/proposal/proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,18 +82,22 @@ def __init__(self, scen: Scenario, knowledge: VectorBase = None) -> Tuple[dict,
self.scen.vector_base.save(KAGGLE_IMPLEMENT_SETTING.rag_path)

def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
hypothesis_feedback = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
hypothesis_and_feedback = (
(
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
)
if len(trace.hist) > 0
else "No previous hypothesis and feedback available since it's the first round."
)

rag_results, _ = self.scen.vector_base.search_experience(hypothesis_feedback, topk_k=5)
rag_results, _ = self.scen.vector_base.search_experience(hypothesis_and_feedback, topk_k=5)
rag_content = "\n".join([doc.content for doc in rag_results])

context_dict = {
"hypothesis_and_feedback": hypothesis_feedback,
"RAG": rag_content,
"hypothesis_and_feedback": hypothesis_and_feedback,
"RAG": None,
"hypothesis_output_format": prompt_dict["hypothesis_output_format"],
"hypothesis_specification": None,
}
Expand Down Expand Up @@ -125,9 +129,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
self.current_action = hypothesis.action

hypothesis_and_feedback = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
(
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
)
if len(trace.hist) > 0
else "No previous hypothesis and feedback available since it's the first round."
)

experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist]
Expand Down
28 changes: 18 additions & 10 deletions rdagent/scenarios/qlib/proposal/factor_proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,18 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
super().__init__(scen)

def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
hypothesis_feedback = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
hypothesis_and_feedback = (
(
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
)
if len(trace.hist) > 0
else "No previous hypothesis and feedback available since it's the first round."
)
context_dict = {
"hypothesis_and_feedback": hypothesis_feedback,
"RAG": ...,
"hypothesis_and_feedback": hypothesis_and_feedback,
"RAG": None,
"hypothesis_output_format": prompt_dict["hypothesis_output_format"],
"hypothesis_specification": prompt_dict["factor_hypothesis_specification"],
}
Expand All @@ -56,9 +60,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict |
experiment_output_format = prompt_dict["factor_experiment_output_format"]

hypothesis_and_feedback = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
(
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
)
if len(trace.hist) > 0
else "No previous hypothesis and feedback available since it's the first round."
)

experiment_list: List[FactorExperiment] = [t[1] for t in trace.hist]
Expand All @@ -73,7 +81,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict |
"hypothesis_and_feedback": hypothesis_and_feedback,
"experiment_output_format": experiment_output_format,
"target_list": factor_list,
"RAG": ...,
"RAG": None,
}, True

def convert_response(self, response: str, trace: Trace) -> FactorExperiment:
Expand Down
Loading

0 comments on commit 8dd3c8c

Please sign in to comment.