From d74da1fd56550652c27cc005f4fd1c93631b59e7 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Wed, 18 Sep 2024 11:20:12 +0000 Subject: [PATCH 1/2] update new feature engineering code format --- .../factor_execution_template.txt | 6 ++-- .../experiment/meta_tpl/feature/feature.py | 22 ++++++++---- .../kaggle/experiment/meta_tpl/train.py | 9 ++--- .../scenarios/kaggle/experiment/prompts.yaml | 36 +++++++++++++------ 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/rdagent/components/coder/factor_coder/factor_execution_template.txt b/rdagent/components/coder/factor_coder/factor_execution_template.txt index af7d4366..a25d5886 100644 --- a/rdagent/components/coder/factor_coder/factor_execution_template.txt +++ b/rdagent/components/coder/factor_coder/factor_execution_template.txt @@ -2,12 +2,14 @@ import os import numpy as np import pandas as pd -from factor import feat_eng +from factor import feature_engineering_cls if os.path.exists("valid.pkl"): valid_df = pd.read_pickle("valid.pkl") else: raise FileNotFoundError("No valid data found.") -new_feat = feat_eng(valid_df) +cls = feature_engineering_cls() +cls.fit(valid_df) +new_feat = cls.transform(valid_df) new_feat.to_hdf("result.h5", key="data", mode="w") diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/feature/feature.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/feature/feature.py index 7dd65426..43468417 100644 --- a/rdagent/scenarios/kaggle/experiment/meta_tpl/feature/feature.py +++ b/rdagent/scenarios/kaggle/experiment/meta_tpl/feature/feature.py @@ -1,13 +1,21 @@ import pandas as pd """ -Here is the feature engineering code for each task, with the function name specified as feat_eng. -The file name should start with feat_, followed by the specific task name. +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember """ +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass -def feat_eng(X: pd.DataFrame): - """ - return the selected features - """ - return X + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + +feature_engineering_cls = IdentityFeature \ No newline at end of file diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/train.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/train.py index 445cf269..cf9f180d 100644 --- a/rdagent/scenarios/kaggle/experiment/meta_tpl/train.py +++ b/rdagent/scenarios/kaggle/experiment/meta_tpl/train.py @@ -44,10 +44,11 @@ def import_module_from_path(module_name, module_path): X_test_l = [] for f in DIRNAME.glob("feature/feat*.py"): - m = import_module_from_path(f.stem, f) - X_train_f = m.feat_eng(X_train) - X_valid_f = m.feat_eng(X_valid) - X_test_f = m.feat_eng(X_test) + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) X_train_l.append(X_train_f) X_valid_l.append(X_valid_f) diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml index eb1783bd..dcbc54ea 100644 --- a/rdagent/scenarios/kaggle/experiment/prompts.yaml +++ b/rdagent/scenarios/kaggle/experiment/prompts.yaml @@ -66,11 +66,13 @@ kg_background: |- kg_feature_interface: |- Your code should contain several parts: 1. The import part: import the necessary libraries. - 2. A feat_eng() function that handles feature engineering for each task. - The function should take the following arguments: - - X: The features as a pandas DataFrame. - The function should return the new features as a pandas DataFrame. - The input to `feat_eng` will be a pandas DataFrame, which should be processed to return a new DataFrame containing only the engineered features. + 2. A class that contains the feature engineering logic. + The class should have the following methods: + - fit: This method should fit the feature engineering model to the training data. + - transform: This method should transform the input data and return it. + For some tasks like generating new features, the fit method may not be necessary. Please pass this function as a no-op. + 3. A variable called feature_engineering_cls that contains the class name. + The input to 'fit' is the training data in pandas dataframe, and the input to 'transform' is the data to be transformed in pandas dataframe. The original columns should be excluded from the returned DataFrame. Exception handling will be managed externally, so avoid using try-except blocks in your code. The user will handle any exceptions that arise and provide feedback as needed. @@ -83,12 +85,24 @@ kg_feature_interface: |- ```python import pandas as pd - def feat_eng(X: pd.DataFrame): - """ - return the selected features - """ - return X.mean(axis=1).to_frame("mean_feature") # Example feature engineering - return X.fillna(0) # Example feature processing + class FeatureEngineeringName: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + For example, for one hot encoding, this would involve fitting the encoder to the training data. + For feature scaling, this would involve fitting the scaler to the training data. + """ + return self + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + return X.mean(axis=1).to_frame("mean_feature") # Example feature engineering + return X.fillna(0) # Example feature processing + + feature_engineering_cls = FeatureEngineeringName ``` To Note: From 8dd0adcdb0a4d9ed0fbbd3bd5ea8f7b823c2f21b Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 19 Sep 2024 02:43:45 +0000 Subject: [PATCH 2/2] fix CI --- .../scenarios/kaggle/experiment/meta_tpl/feature/feature.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/feature/feature.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/feature/feature.py index 43468417..8ae043ac 100644 --- a/rdagent/scenarios/kaggle/experiment/meta_tpl/feature/feature.py +++ b/rdagent/scenarios/kaggle/experiment/meta_tpl/feature/feature.py @@ -5,6 +5,7 @@ Remember """ + class IdentityFeature: def fit(self, train_df: pd.DataFrame): """ @@ -18,4 +19,5 @@ def transform(self, X: pd.DataFrame): """ return X -feature_engineering_cls = IdentityFeature \ No newline at end of file + +feature_engineering_cls = IdentityFeature