From cde7c1f2e32172baf9c9577379061801d9fa3f34 Mon Sep 17 00:00:00 2001
From: Bowen Xian <xianbowen@outlook.com>
Date: Thu, 19 Sep 2024 09:25:54 +0000
Subject: [PATCH 1/4] add kaggle test

---
 test/utils/test_kaggle.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 test/utils/test_kaggle.py

diff --git a/test/utils/test_kaggle.py b/test/utils/test_kaggle.py
new file mode 100644
index 00000000..a434470d
--- /dev/null
+++ b/test/utils/test_kaggle.py
@@ -0,0 +1,17 @@
+import unittest
+import nbformat
+
+
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.utils.agent.ret import PythonAgentOut
+from rdagent.utils.agent.tpl import T
+
+
+class TestTplGen(unittest.TestCase):
+    def generate(self, competition: str = "feedback-prize-english-language-learning"):
+        
+        print(competition)
+
+
+if __name__ == "__main__":
+    unittest.main()

From a7f12f2a2eff7b8c74539fe970e9fb88fc5a419d Mon Sep 17 00:00:00 2001
From: Bowen Xian <xianbowen@outlook.com>
Date: Fri, 20 Sep 2024 12:40:32 +0000
Subject: [PATCH 2/4] kaggle templates changes

---
 rdagent/app/kaggle/conf.py                    |   2 +
 rdagent/app/kaggle/loop.py                    |   6 +-
 rdagent/scenarios/kaggle/developer/runner.py  |   2 -
 .../fea_share_preprocess.py                   | 206 ++++++++++++++++++
 .../feature/feature.py                        |  15 ++
 .../model/model.py                            |  15 ++
 .../train.py                                  |  34 +++
 .../kaggle/experiment/kaggle_experiment.py    |   6 +-
 .../cross_validation_tpl.py                   |  87 ++++++++
 .../fea_share_preprocess.py                   | 111 ++++++++++
 .../feature/feature.py                        |  23 ++
 .../model/model_rf.py                         |  54 +++++
 .../model/model_xgb.py                        |  40 ++++
 .../playground-series-s4e8_template/train.py  | 121 ++++++++++
 .../scenarios/kaggle/experiment/scenario.py   |   2 +-
 .../scenarios/kaggle/experiment/workspace.py  |   6 +-
 rdagent/scenarios/kaggle/kaggle_crawler.py    |  21 +-
 rdagent/utils/env.py                          |  36 +--
 test/utils/test_kaggle.py                     |  21 +-
 19 files changed, 758 insertions(+), 50 deletions(-)
 create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/feature/feature.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_rf.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgb.py
 create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py

diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py
index ea5594c1..b1f30fbd 100644
--- a/rdagent/app/kaggle/conf.py
+++ b/rdagent/app/kaggle/conf.py
@@ -42,6 +42,8 @@ class Config:
 
     competition: str = ""
 
+    local_data_path: str = "/data/userdata/share/kaggle"
+
     rag_path: str = "git_ignore_folder/rag"
 
 
diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
index e4fac885..afb98fb8 100644
--- a/rdagent/app/kaggle/loop.py
+++ b/rdagent/app/kaggle/loop.py
@@ -2,7 +2,7 @@
 from typing import Any
 
 import fire
-
+from rdagent.scenarios.kaggle.kaggle_crawler import download_data
 from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
 from rdagent.components.workflow.conf import BasePropSetting
 from rdagent.components.workflow.rd_loop import RDLoop
@@ -88,6 +88,10 @@ def main(path=None, step_n=None, competition=None):
     """
     if competition:
         KAGGLE_IMPLEMENT_SETTING.competition = competition
+        download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path)
+    else:
+        logger.error("Please specify competition name.")
+
     if path is None:
         model_loop = ModelRDLoop(KAGGLE_IMPLEMENT_SETTING)
     else:
diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py
index 7fa6242a..43f867eb 100644
--- a/rdagent/scenarios/kaggle/developer/runner.py
+++ b/rdagent/scenarios/kaggle/developer/runner.py
@@ -15,8 +15,6 @@
     KGModelExperiment,
 )
 
-META_TPL_DIR = Path(__file__).parent.parent / "experiment" / "meta_tpl"
-
 
 class KGCachedRunner(CachedRunner[ASpecificExp]):
     def build_from_SOTA(self, exp: ASpecificExp) -> None:
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
new file mode 100644
index 00000000..a4c0783f
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
@@ -0,0 +1,206 @@
+# TODO: Fix
+import numpy as np # linear algebra
+import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+import re
+
+
+train = pd.read_csv("/kaggle/input/train.csv")
+test = pd.read_csv("/kaggle/input/test.csv")
+submission = pd.read_csv("/kaggle/input/sample_submission.csv")
+
+
+
+features = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar',  'conventions']
+target = train[features]
+
+
+
+
+text_train = train['full_text']
+text_test = test['full_text']
+
+text = pd.concat([text_train, text_test], ignore_index=True)
+
+
+
+count_words = text.str.findall(r'(\w+)').str.len()
+print(count_words.sum())
+
+
+
+""" Cleaning Text """
+text = text.str.lower()
+
+# Drop every non-letter character that is directly followed by whitespace (the whitespace goes too).
+text = text.apply(lambda x: re.sub(r"[^a-z]\s", "", x))  # raw string: "\s" is an invalid str escape
+
+# remove hash tags
+text = text.str.replace("#", "")
+
+# keep only words that are 3 to 7 characters long
+text = text.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2 and len(w)<8]))
+
+# removing stopwords
+#text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords ))
+
+count_words = text.str.findall(r'(\w+)').str.len()
+print(count_words.sum())
+
+
+
+
+
+most_freq_words = pd.Series(' '.join(text).lower().split()).value_counts()[:25]
+text = text.apply(lambda x : " ".join(word for word in x.split() if word not in most_freq_words ))
+
+count_words = text.str.findall(r'(\w+)').str.len()
+
+apostrophe_dict = {
+"ain't": "am not / are not",
+"aren't": "are not / am not",
+"can't": "cannot",
+"can't've": "cannot have",
+"'cause": "because",
+"could've": "could have",
+"couldn't": "could not",
+"couldn't've": "could not have",
+"didn't": "did not",
+"doesn't": "does not",
+"don't": "do not",
+"hadn't": "had not",
+"hadn't've": "had not have",
+"hasn't": "has not",
+"haven't": "have not",
+"he'd": "he had / he would",
+"he'd've": "he would have",
+"he'll": "he shall / he will",
+"he'll've": "he shall have / he will have",
+"he's": "he has / he is",
+"how'd": "how did",
+"how'd'y": "how do you",
+"how'll": "how will",
+"how's": "how has / how is",
+"i'd": "I had / I would",
+"i'd've": "I would have",
+"i'll": "I shall / I will",
+"i'll've": "I shall have / I will have",
+"i'm": "I am",
+"i've": "I have",
+"isn't": "is not",
+"it'd": "it had / it would",
+"it'd've": "it would have",
+"it'll": "it shall / it will",
+"it'll've": "it shall have / it will have",
+"it's": "it has / it is",
+"let's": "let us",
+"ma'am": "madam",
+"mayn't": "may not",
+"might've": "might have",
+"mightn't": "might not",
+"mightn't've": "might not have",
+"must've": "must have",
+"mustn't": "must not",
+"mustn't've": "must not have",
+"needn't": "need not",
+"needn't've": "need not have",
+"o'clock": "of the clock",
+"oughtn't": "ought not",
+"oughtn't've": "ought not have",
+"shan't": "shall not",
+"sha'n't": "shall not",
+"shan't've": "shall not have",
+"she'd": "she had / she would",
+"she'd've": "she would have",
+"she'll": "she shall / she will",
+"she'll've": "she shall have / she will have",
+"she's": "she has / she is",
+"should've": "should have",
+"shouldn't": "should not",
+"shouldn't've": "should not have",
+"so've": "so have",
+"so's": "so as / so is",
+"that'd": "that would / that had",
+"that'd've": "that would have",
+"that's": "that has / that is",
+"there'd": "there had / there would",
+"there'd've": "there would have",
+"there's": "there has / there is",
+"they'd": "they had / they would",
+"they'd've": "they would have",
+"they'll": "they shall / they will",
+"they'll've": "they shall have / they will have",
+"they're": "they are",
+"they've": "they have",
+"to've": "to have",
+"wasn't": "was not",
+"we'd": "we had / we would",
+"we'd've": "we would have",
+"we'll": "we will",
+"we'll've": "we will have",
+"we're": "we are",
+"we've": "we have",
+"weren't": "were not",
+"what'll": "what shall / what will",
+"what'll've": "what shall have / what will have",
+"what're": "what are",
+"what's": "what has / what is",
+"what've": "what have",
+"when's": "when has / when is",
+"when've": "when have",
+"where'd": "where did",
+"where's": "where has / where is",
+"where've": "where have",
+"who'll": "who shall / who will",
+"who'll've": "who shall have / who will have",
+"who's": "who has / who is",
+"who've": "who have",
+"why's": "why has / why is",
+"why've": "why have",
+"will've": "will have",
+"won't": "will not",
+"won't've": "will not have",
+"would've": "would have",
+"wouldn't": "would not",
+"wouldn't've": "would not have",
+"y'all": "you all",
+"y'all'd": "you all would",
+"y'all'd've": "you all would have",
+"y'all're": "you all are",
+"y'all've": "you all have",
+"you'd": "you had / you would",
+"you'd've": "you would have",
+"you'll": "you shall / you will",
+"you'll've": "you shall have / you will have",
+"you're": "you are",
+"you've": "you have"
+}
+
+
+def lookup_dict(txt, dictionary):
+    """Return ``txt`` with each whitespace-separated token found (case-insensitively) in ``dictionary`` expanded."""
+    # Map token by token: ``str.replace`` on the whole text also rewrote substrings of
+    # longer words (e.g. "he's" inside "she's"). Tokens are re-joined with single spaces.
+    tokens = (dictionary.get(word.lower(), word) for word in txt.split())
+    return " ".join(tokens)
+
+
+text = text.apply(lambda x: lookup_dict(x,apostrophe_dict))
+
+# Remove rare words
+from collections import Counter
+from itertools import chain
+
+# split words into lists
+v = text.str.split().tolist() 
+# compute global word frequency
+c = Counter(chain.from_iterable(v))
+# filter, join, and re-assign
+text = [' '.join([j for j in i if c[j] > 1]) for i in v]
+text = pd.Series(text)
+
+total_word = 0
+for x,word in enumerate(text):
+    num_word = len(word.split())
+    #print(num_word)
+    total_word = total_word + num_word
+print(total_word)
\ No newline at end of file
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
new file mode 100644
index 00000000..f8c410bb
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
@@ -0,0 +1,15 @@
+from sklearn.feature_extraction.text import TfidfVectorizer
+import pandas as pd
+import numpy as np
+
+class TfidfFeature:
+    """TF-IDF text featurizer with the fit/transform interface used by the template train.py."""
+    def fit(self, train_df: pd.DataFrame):
+        # Each row is joined into one string before vectorizing.
+        train_X = list(map(''.join, np.array(train_df).tolist()))
+        self.model = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=0.01).fit(train_X)
+
+    def transform(self, X: pd.DataFrame):
+        return self.model.transform(list(map(''.join, np.array(X).tolist())))
+
+feature_engineering_cls = TfidfFeature  # entry point that train.py instantiates for every feature/feat*.py
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
new file mode 100644
index 00000000..487e3a5b
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
@@ -0,0 +1,15 @@
+import pandas as pd
+from sklearn.multioutput import MultiOutputRegressor
+from sklearn.svm import SVR
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    return X
+
+def fit(X_train: pd.DataFrame, y_train: pd.Series):
+    model = MultiOutputRegressor(SVR())
+    model.fit(X_train, y_train)
+    return model
+
+def predict(model: MultiOutputRegressor, X_test: pd.DataFrame):
+    X_test_selected = select(X_test)
+    return model.predict(X_test_selected)
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
new file mode 100644
index 00000000..300f1ae7
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
@@ -0,0 +1,34 @@
+# TODO: fix the train.py
+
+import importlib.util
+from pathlib import Path
+
+def import_module_from_path(module_name, module_path):
+    spec = importlib.util.spec_from_file_location(module_name, module_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+y = target
+X = text[: len(train)]
+X_test = text[len(train) :]
+
+for f in DIRNAME.glob("feature/feat*.py"):
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_f = cls.transform(X_train)
+    X_test_f = cls.transform(X_test)
+
+    X_train_l.append(X_train_f)
+    X_test_l.append(X_test_f)
+
+
+submission['cohesion'] = predictions[:,0]
+submission['syntax'] = predictions[:,1]
+submission['vocabulary'] = predictions[:,2]
+submission['phraseology'] = predictions[:,3]
+submission['grammar'] = predictions[:,4]
+submission['conventions'] = predictions[:,5]
+
+submission.to_csv('submission.csv',index=False) # writing data to a CSV file
\ No newline at end of file
diff --git a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
index 99602006..df743a69 100644
--- a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
+++ b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
@@ -11,15 +11,15 @@
     ModelTask,
 )
 from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace
-
+from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
 
 class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / "meta_tpl")
+        self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template")
 
 
 class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]):
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / "meta_tpl")
+        self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template")
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py
new file mode 100644
index 00000000..90ec0c2a
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py
@@ -0,0 +1,87 @@
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import KFold
+from sklearn.preprocessing import LabelEncoder
+
+from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess
+
+
+def compute_metrics_for_classification(y_true, y_pred):
+    """Compute MCC for classification."""
+    from sklearn.metrics import matthews_corrcoef
+
+    return matthews_corrcoef(y_true, y_pred)
+
+
+def perform_kfold_cross_validation(X, y, n_splits=2, random_seed=42):
+    """Run shuffled K-fold CV over the feat*/model* modules beside this file and return the mean MCC."""
+    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
+    fold_metrics = []
+    DIRNAME = Path(__file__).absolute().resolve().parent
+
+    for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
+        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
+        y_train_fold, y_valid_fold = y[train_idx], y[valid_idx]
+
+        # TODO: Preprocess and Feature Engineering before K-Fold CV
+
+        # Preprocess the data
+        X_train_fold = preprocess(X_train_fold)
+        X_valid_fold = preprocess(X_valid_fold)
+
+        # Feature Engineering
+        X_train_l_fold, X_valid_l_fold = [], []
+        for f in DIRNAME.glob("feat*.py"):
+            # Path.stem drops only the suffix; str.strip(".py") strips the characters '.', 'p', 'y' from both ends.
+            m = __import__(f.stem)
+            X_train_fold = m.feat_eng(X_train_fold)
+            X_valid_fold = m.feat_eng(X_valid_fold)
+            X_train_l_fold.append(X_train_fold)
+            X_valid_l_fold.append(X_valid_fold)
+
+        X_train_fold = pd.concat(X_train_l_fold, axis=1)
+        X_valid_fold = pd.concat(X_valid_l_fold, axis=1)
+
+        # Align features
+        X_valid_fold = X_valid_fold.reindex(columns=X_train_fold.columns, fill_value=0)
+
+        # Train and evaluate models
+        mcc_scores = []
+        model_l = []  # Reinitialize model list
+        for f in DIRNAME.glob("model*.py"):
+            m = __import__(f.stem)
+            model = m.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold)
+            y_valid_pred = m.predict(model, X_valid_fold)
+            mcc = compute_metrics_for_classification(y_valid_fold, y_valid_pred)
+            mcc_scores.append(mcc)
+            print(f"Fold {fold+1}, Model {f.name}: MCC = {mcc}")
+
+        # Store the average MCC score for this fold
+        avg_mcc = np.mean(mcc_scores)
+        fold_metrics.append(avg_mcc)
+        print(f"Fold {fold+1} average MCC: {avg_mcc}")
+
+    # Calculate the overall average MCC
+    overall_avg_mcc = np.mean(fold_metrics)
+    result_df = pd.DataFrame({"Overall Average MCC": [overall_avg_mcc]})
+    result_df.to_csv(f"path/to/playground-series-s4e8/cv_score_{f.stem}.csv", index=False)
+
+    print(f"Overall Average MCC across all folds: {overall_avg_mcc}")
+    return overall_avg_mcc
+
+
+# This allows the script to be run directly
+if __name__ == "__main__":
+    # Load and preprocess the data
+    data_df = pd.read_csv("path/to/playground-series-s4e8/train.csv")
+    data_df = data_df.drop(["id"], axis=1)
+
+    X = data_df.drop(["class"], axis=1)
+    y = data_df[["class"]]
+
+    label_encoder = LabelEncoder()
+    # transform y to 1D
+    y = label_encoder.fit_transform(y)
+    result = perform_kfold_cross_validation(X, y)
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py
new file mode 100644
index 00000000..4b4ef273
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py
@@ -0,0 +1,111 @@
+import os
+
+import pandas as pd
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+
+
+def prepreprocess():
+    """
+    This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
+    """
+    # Load and preprocess the data
+    data_df = pd.read_csv("/kaggle/input/train.csv")
+    data_df = data_df.drop(["id"], axis=1)
+
+    X = data_df.drop(["class"], axis=1)
+    y = data_df[["class"]]
+
+    label_encoder = LabelEncoder()
+    y = label_encoder.fit_transform(y)  # Convert class labels to numeric
+
+    # Split the data into training and validation sets
+    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=42)
+
+    return X_train, X_valid, y_train, y_valid
+
+
+def preprocess_fit(X_train: pd.DataFrame):
+    """
+    Fits the preprocessor on the training data and returns the fitted preprocessor.
+    """
+    # Identify numerical and categorical features
+    numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ["int64", "float64"]]
+    categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"]
+
+    # Define preprocessors for numerical and categorical features
+    categorical_transformer = Pipeline(
+        steps=[
+            ("imputer", SimpleImputer(strategy="most_frequent")),
+            ("onehot", OneHotEncoder(handle_unknown="ignore")),
+        ]
+    )
+
+    numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])
+
+    # Combine preprocessing steps
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ("cat", categorical_transformer, categorical_cols),
+            ("num", numerical_transformer, numerical_cols),
+        ]
+    )
+
+    # Fit the preprocessor on the training data
+    preprocessor.fit(X_train)
+
+    return preprocessor
+
+
+def preprocess_transform(X: pd.DataFrame, preprocessor):
+    """
+    Transforms the given DataFrame using the fitted preprocessor.
+    Ensures the processed data has consistent features across train, validation, and test sets.
+    """
+    # ColumnTransformer returns a sparse matrix only when the stacked output is sparse enough
+    # (sparse_threshold); densify when needed instead of assuming ``.toarray()`` exists.
+    X_array = preprocessor.transform(X)
+    X_array = X_array.toarray() if hasattr(X_array, "toarray") else X_array
+
+    # Get feature names for the columns in the transformed data
+    categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
+    feature_names = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out(
+        categorical_cols
+    ).tolist() + [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]]
+
+    # Convert arrays back to DataFrames
+    return pd.DataFrame(X_array, columns=feature_names, index=X.index)
+
+
+def preprocess_script():
+    """
+    This method applies the preprocessing steps to the training, validation, and test datasets.
+    """
+    if os.path.exists("X_train.pkl"):
+        X_train = pd.read_pickle("X_train.pkl")
+        X_valid = pd.read_pickle("X_valid.pkl")
+        y_train = pd.read_pickle("y_train.pkl")
+        y_valid = pd.read_pickle("y_valid.pkl")
+        X_test = pd.read_pickle("X_test.pkl")
+        passenger_ids = pd.read_pickle("passenger_ids.pkl")
+
+        return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
+    X_train, X_valid, y_train, y_valid = prepreprocess()
+
+    # Fit the preprocessor on the training data
+    preprocessor = preprocess_fit(X_train)
+
+    # Preprocess the train, validation, and test data
+    X_train = preprocess_transform(X_train, preprocessor)
+    X_valid = preprocess_transform(X_valid, preprocessor)
+
+    # Load and preprocess the test data
+    submission_df = pd.read_csv("/kaggle/input/test.csv")
+    passenger_ids = submission_df["id"]
+    submission_df = submission_df.drop(["id"], axis=1)
+    X_test = preprocess_transform(submission_df, preprocessor)
+
+    return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/feature/feature.py
new file mode 100644
index 00000000..8ae043ac
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/feature/feature.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+"""
+Here is the feature engineering code for each task, with a class that has a fit and transform method.
+Remember
+"""
+
+
+class IdentityFeature:
+    def fit(self, train_df: pd.DataFrame):
+        """
+        Fit the feature engineering model to the training data.
+        """
+        pass
+
+    def transform(self, X: pd.DataFrame):
+        """
+        Transform the input data.
+        """
+        return X
+
+
+feature_engineering_cls = IdentityFeature
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_rf.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_rf.py
new file mode 100644
index 00000000..3c64a094
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_rf.py
@@ -0,0 +1,54 @@
+"""
+Motivation of the model:
+The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
+It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
+baseline model for many classification tasks.
+"""
+
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features. To be used in fit & predict function.
+    """
+    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
+    return X
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
+    """
+    Define and train the Random Forest model. Merge feature selection into the pipeline.
+    """
+    # Initialize the Random Forest model
+    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
+
+    # Select features (if any feature selection is needed)
+    X_train_selected = select(X_train)
+    X_valid_selected = select(X_valid)
+
+    # Fit the model
+    model.fit(X_train_selected, y_train)
+
+    # Validate the model
+    y_valid_pred = model.predict(X_valid_selected)
+    accuracy = accuracy_score(y_valid, y_valid_pred)
+    print(f"Validation Accuracy: {accuracy:.4f}")
+
+    return model
+
+
+def predict(model, X):
+    """
+    Apply the same feature selection as ``fit`` and return column 1 of ``predict_proba``.
+    """
+    # Select features (if any feature selection is needed)
+    X_selected = select(X)
+
+    # Predict using the trained model
+    y_pred_prob = model.predict_proba(X_selected)[:, 1]
+
+    # Returned unthresholded: no cut-off is applied in this function
+    return y_pred_prob
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgb.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgb.py
new file mode 100644
index 00000000..56b81c9a
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgb.py
@@ -0,0 +1,40 @@
+"""
+motivation  of the model
+"""
+
+import pandas as pd
+import xgboost as xgb
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    # Ignore feature selection logic
+    return X
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
+    """Train an XGBoost booster on the selected features, passing train and validation sets as eval sets."""
+    X_train = select(X_train)
+    X_valid = select(X_valid)
+    dtrain = xgb.DMatrix(X_train, label=y_train)
+    dvalid = xgb.DMatrix(X_valid, label=y_valid)
+
+    # TODO: for quick running....
+    params = {
+        "nthread": -1,  # was misspelled "nthred", which is not an XGBoost parameter name
+    }
+    num_round = 200
+
+    evallist = [(dtrain, "train"), (dvalid, "eval")]
+    bst = xgb.train(params, dtrain, num_round, evallist)
+
+    return bst
+
+
+def predict(model, X):
+    """
+    Keep feature select's consistency.
+    """
+    X = select(X)
+    dtest = xgb.DMatrix(X)
+    y_pred_prob = model.predict(dtest)
+    return y_pred_prob
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py
new file mode 100644
index 00000000..cff02620
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py
@@ -0,0 +1,121 @@
+import importlib.util
+import random
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from fea_share_preprocess import preprocess_script
+from sklearn.metrics import accuracy_score, matthews_corrcoef
+from sklearn.preprocessing import LabelEncoder
+
+# Set random seed for reproducibility
+SEED = 42
+random.seed(SEED)
+np.random.seed(SEED)
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+
+# support various method for metrics calculation
+def compute_accuracy_for_classification(y_true, y_pred):
+    """Compute accuracy for classification (own name so the MCC metric below no longer shadows it)."""
+    accuracy = accuracy_score(y_true, y_pred)
+    return accuracy
+
+
+def compute_metrics_for_classification(y_true, y_pred):
+    """Compute MCC for classification; this is the metric the script reports."""
+    mcc = matthews_corrcoef(y_true, y_pred)
+    return mcc
+
+
+def import_module_from_path(module_name, module_path):
+    spec = importlib.util.spec_from_file_location(module_name, module_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+# 1) Preprocess the data
+# TODO: skip this step if the data has already been preprocessed
+X_train, X_valid, y_train, y_valid, X_test, passenger_ids = preprocess_script()
+
+# 2) Auto feature engineering
+X_train_l, X_valid_l = [], []
+X_test_l = []
+
+for f in DIRNAME.glob("feature/feat*.py"):
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_f = cls.transform(X_train)
+    X_valid_f = cls.transform(X_valid)
+    X_test_f = cls.transform(X_test)
+
+    X_train_l.append(X_train_f)
+    X_valid_l.append(X_valid_f)
+    X_test_l.append(X_test_f)
+
+X_train = pd.concat(X_train_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_train_l))])
+X_valid = pd.concat(X_valid_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_valid_l))])
+X_test = pd.concat(X_test_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_test_l))])
+
+print(X_train.shape, X_valid.shape, X_test.shape)
+
+# Handle inf and -inf values
+X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
+X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
+X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
+
+from sklearn.impute import SimpleImputer
+
+imputer = SimpleImputer(strategy="mean")
+
+X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
+X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
+X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
+
+# Remove duplicate columns
+X_train = X_train.loc[:, ~X_train.columns.duplicated()]
+X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
+X_test = X_test.loc[:, ~X_test.columns.duplicated()]
+
+# 3) Train the model
+model_l = []  # list[tuple[model, predict_func,]]
+for f in DIRNAME.glob("model/model*.py"):
+    m = import_module_from_path(f.stem, f)
+    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))
+
+# 4) Evaluate the model on the validation set
+y_valid_pred_l = []
+for model, predict_func in model_l:
+    y_valid_pred_l.append(predict_func(model, X_valid))
+
+# 5) Ensemble
+# TODO: ensemble method in a script
+# Average the predictions and apply a threshold to determine class labels
+y_valid_pred = np.mean(y_valid_pred_l, axis=0)
+y_valid_pred = (y_valid_pred > 0.5).astype(int)
+
+mcc = compute_metrics_for_classification(y_valid, y_valid_pred)
+print("Final on validation set: ", mcc)
+
+# 6) Save the validation accuracy
+pd.Series(data=[mcc], index=["MCC"]).to_csv("submission_score.csv")
+
+# 7) Make predictions on the test set and save them
+# NOTE(review): y_train is already integer-encoded by prepreprocess(), so this encoder maps
+# ints to ints; recovering 'e'/'p' needs the encoder fitted on the raw labels -- confirm.
+label_encoder = LabelEncoder()
+label_encoder.fit(y_train)
+# Keep the raw scores: astype(int) before averaging truncated every value below 1.0 to 0,
+# unlike the validation ensemble in step 5, which averages first and thresholds afterwards.
+y_test_pred_l = [m_pred(m, X_test) for m, m_pred in model_l]
+
+y_test_pred = np.mean(y_test_pred_l, axis=0)
+y_test_pred = (y_test_pred > 0.5).astype(int)
+
+y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)  # map integer predictions back to class labels
+
+submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels})
+
+# 8) Submit predictions for the test set
+submission_result.to_csv("submission.csv", index=False)
diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
index cebae0e2..0e602262 100644
--- a/rdagent/scenarios/kaggle/experiment/scenario.py
+++ b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -76,7 +76,7 @@ def _analysis_competition_description(self):
     def background(self) -> str:
         background_template = prompt_dict["kg_background"]
 
-        train_script = (Path(__file__).parent / "meta_tpl" / "train.py").read_text()
+        train_script = (Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py").read_text()
 
         background_prompt = (
             Environment(undefined=StrictUndefined)
diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
index 000fa3d9..759748b1 100644
--- a/rdagent/scenarios/kaggle/experiment/workspace.py
+++ b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -5,7 +5,6 @@
 import pandas as pd
 
 from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
-from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
 from rdagent.core.experiment import FBWorkspace
 from rdagent.log import rdagent_logger as logger
 from rdagent.utils.env import KGDockerEnv
@@ -61,7 +60,7 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
         logger.info(f"Running the experiment in {self.workspace_path}")
 
         # link the data to the workspace to speed up the preprocessing
-        source_data_path = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition
+        source_data_path = Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / KAGGLE_IMPLEMENT_SETTING.competition
         self.link_all_files_in_folder_to_workspace(source_data_path, self.workspace_path)
 
         kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
@@ -71,6 +70,9 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
             local_path=str(self.workspace_path),
             entry=f"python train.py",
             env=run_env,
+            running_extra_volume=(
+                {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} if KAGGLE_IMPLEMENT_SETTING.competition else None
+            ),
         )
 
         csv_path = self.workspace_path / "submission_score.csv"
diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py
index 828f65de..d74e2faf 100644
--- a/rdagent/scenarios/kaggle/kaggle_crawler.py
+++ b/rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -1,5 +1,8 @@
 import json
 import time
+import subprocess
+import zipfile
+
 from pathlib import Path
 
 from selenium import webdriver
@@ -7,7 +10,7 @@
 from selenium.webdriver.common.by import By
 
 from rdagent.log import rdagent_logger as logger
-from rdagent.utils.env import KGDockerConf
+from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
 
 options = webdriver.ChromeOptions()
 options.add_argument("--no-sandbox")
@@ -18,7 +21,7 @@
 
 
 def crawl_descriptions(competition: str, wait: float = 3.0, force: bool = False) -> dict[str, str]:
-    if (fp := Path(f"{KGDockerConf().local_data_path}/{competition}.json")).exists() and not force:
+    if (fp := Path(f"{KAGGLE_IMPLEMENT_SETTING.local_data_path}/{competition}.json")).exists() and not force:
         logger.info(f"Found {competition}.json, loading from local file.")
         with fp.open("r") as f:
             return json.load(f)
@@ -62,12 +65,24 @@ def crawl_descriptions(competition: str, wait: float = 3.0, force: bool = False)
     descriptions["Data Description"] = data_element.get_attribute("innerHTML")
 
     driver.quit()
-    with open(f"{KGDockerConf().dockerfile_folder_path}/{competition}.json", "w") as f:
+    with open(f"{KAGGLE_IMPLEMENT_SETTING.local_data_path}/{competition}.json", "w") as f:
         json.dump(descriptions, f)
     return descriptions
 
 
+def download_data(competition: str, local_path: str = "/data/userdata/share/kaggle") -> None:
+    """Download and unzip the data of `competition` under `local_path`, unless its folder already exists."""
+    data_path = f"{local_path}/{competition}"
+    if not Path(data_path).exists():
+        # check=True: stop on a failed download instead of hitting a missing-zip error further down
+        subprocess.run(["kaggle", "competitions", "download", "-c", competition, "-p", data_path], check=True)
+        with zipfile.ZipFile(f"{data_path}/{competition}.zip", "r") as zip_ref:
+            zip_ref.extractall(data_path)
+
+
 if __name__ == "__main__":
+    download_data("feedback-prize-english-language-learning", "/data/userdata/share/kaggle")
+    exit()
     from kaggle.api.kaggle_api_extended import KaggleApi
 
     api = KaggleApi()
diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py
index 36fbe044..5e2524fd 100644
--- a/rdagent/utils/env.py
+++ b/rdagent/utils/env.py
@@ -176,12 +176,12 @@ class Config:
     # image: str = "gcr.io/kaggle-gpu-images/python:latest"
     mount_path: str = "/workspace/kg_workspace/"
     default_entry: str = "python train.py"
-    extra_volumes: dict = {
-        # TODO connect to the place where the data is stored
-        Path("git_ignore_folder/data").resolve(): "/root/.data/"
-    }
+    # extra_volumes: dict = {
+    #     # TODO connect to the place where the data is stored
+    #     Path("git_ignore_folder/data").resolve(): "/root/.data/"
+    # }
 
-    local_data_path: str = "/data/userdata/share/kaggle"
+    # local_data_path: str = "/data/userdata/share/kaggle"
 
 
 # physionet.org/files/mimic-eicu-fiddle-feature/1.0.0/FIDDLE_mimic3
@@ -387,29 +387,3 @@ class KGDockerEnv(DockerEnv):
 
     def __init__(self, competition: str = None, conf: DockerConf = KGDockerConf()):
         super().__init__(conf)
-        self.competition = competition
-
-    def prepare(self):
-        """
-        Download image & data if it doesn't exist
-        """
-        super().prepare()
-
-        # download data, if competition is not provided, the user is targeting a general docker environment in kaggle
-        data_path = f"{self.conf.local_data_path}/{self.competition}"
-        if self.competition is not None and not Path(data_path).exists():
-            subprocess.run(["kaggle", "competitions", "download", "-c", self.competition, "-p", data_path])
-
-            # unzip data
-            with zipfile.ZipFile(f"{data_path}/{self.competition}.zip", "r") as zip_ref:
-                zip_ref.extractall(data_path)
-
-    def run(self, entry: str | None = None, local_path: str | None = None, env: dict | None = None):
-        super().run(
-            entry=entry,
-            local_path=local_path,
-            env=env,
-            running_extra_volume=(
-                {self.conf.local_data_path + "/" + self.competition: "/kaggle/input"} if self.competition else None
-            ),
-        )
diff --git a/test/utils/test_kaggle.py b/test/utils/test_kaggle.py
index a434470d..8419c361 100644
--- a/test/utils/test_kaggle.py
+++ b/test/utils/test_kaggle.py
@@ -1,17 +1,24 @@
 import unittest
 import nbformat
-
+from pathlib import Path
 
 from rdagent.oai.llm_utils import APIBackend
 from rdagent.utils.agent.ret import PythonAgentOut
 from rdagent.utils.agent.tpl import T
 
-
-class TestTplGen(unittest.TestCase):
-    def generate(self, competition: str = "feedback-prize-english-language-learning"):
-        
-        print(competition)
-
+from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
+from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace
+from rich import print
+class TestTpl(unittest.TestCase):
+    def test_competition_template(self):
+        competition = KAGGLE_IMPLEMENT_SETTING.competition
+        print(f"[bold orange]{competition}[/bold orange]")
+        ws = KGFBWorkspace(template_folder_path=Path(__file__).parent.parent.parent / "rdagent/scenarios/kaggle/experiment" / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template")
+        print(ws.workspace_path)
+        ws.execute()
+        success = (ws.workspace_path / "submission.csv").exists()
+        ws.clear()
+        return success
 
 if __name__ == "__main__":
     unittest.main()

From b145d1969c3ddfcad1be031dee83e4e72d8a04f6 Mon Sep 17 00:00:00 2001
From: Bowen Xian <xianbowen@outlook.com>
Date: Fri, 20 Sep 2024 12:41:54 +0000
Subject: [PATCH 3/4] CI

---
 rdagent/app/kaggle/loop.py                    |   3 +-
 .../fea_share_preprocess.py                   | 282 +++++++++---------
 .../feature/feature.py                        |  11 +-
 .../model/model.py                            |   3 +
 .../train.py                                  |  17 +-
 .../kaggle/experiment/kaggle_experiment.py    |  11 +-
 .../playground-series-s4e8_template/train.py  |   6 +-
 .../scenarios/kaggle/experiment/scenario.py   |   4 +-
 .../scenarios/kaggle/experiment/workspace.py  |   4 +-
 rdagent/scenarios/kaggle/kaggle_crawler.py    |   5 +-
 test/utils/test_kaggle.py                     |  17 +-
 11 files changed, 189 insertions(+), 174 deletions(-)

diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
index afb98fb8..c75b4601 100644
--- a/rdagent/app/kaggle/loop.py
+++ b/rdagent/app/kaggle/loop.py
@@ -2,7 +2,7 @@
 from typing import Any
 
 import fire
-from rdagent.scenarios.kaggle.kaggle_crawler import download_data
+
 from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
 from rdagent.components.workflow.conf import BasePropSetting
 from rdagent.components.workflow.rd_loop import RDLoop
@@ -17,6 +17,7 @@
 from rdagent.core.scenario import Scenario
 from rdagent.core.utils import import_class
 from rdagent.log import rdagent_logger as logger
+from rdagent.scenarios.kaggle.kaggle_crawler import download_data
 from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
     KaggleExperienceBase,
 )
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
index a4c0783f..43b4beb4 100644
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
@@ -1,178 +1,170 @@
 # TODO: Fix
-import numpy as np # linear algebra
-import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
 import re
 
+import numpy as np  # linear algebra
+import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
 
 train = pd.read_csv("/kaggle/input/train.csv")
 test = pd.read_csv("/kaggle/input/test.csv")
 submission = pd.read_csv("/kaggle/input/sample_submission.csv")
 
 
-
-features = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar',  'conventions']
+features = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
 target = train[features]
 
 
-
-
-text_train = train['full_text']
-text_test = test['full_text']
+text_train = train["full_text"]
+text_test = test["full_text"]
 
 text = pd.concat([text_train, text_test], ignore_index=True)
 
 
-
-count_words = text.str.findall(r'(\w+)').str.len()
+count_words = text.str.findall(r"(\w+)").str.len()
 print(count_words.sum())
 
 
-
 """ Cleaning Text """
 text = text.str.lower()
 
 # removing special characters and numbers
-text = text.apply(lambda x : re.sub("[^a-z]\s","",x) )
+text = text.apply(lambda x: re.sub(r"[^a-z]\s", "", x))
 
 # remove hash tags
 text = text.str.replace("#", "")
 
-#remove words less than 3 character and greater than 7
-text = text.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2 and len(w)<8]))
+# remove words less than 3 character and greater than 7
+text = text.apply(lambda x: " ".join([w for w in x.split() if len(w) > 2 and len(w) < 8]))
 
 # removing stopwords
-#text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords ))
+# text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords ))
 
-count_words = text.str.findall(r'(\w+)').str.len()
+count_words = text.str.findall(r"(\w+)").str.len()
 print(count_words.sum())
 
 
+most_freq_words = pd.Series(" ".join(text).lower().split()).value_counts()[:25]
+text = text.apply(lambda x: " ".join(word for word in x.split() if word not in most_freq_words))
 
-
-
-most_freq_words = pd.Series(' '.join(text).lower().split()).value_counts()[:25]
-text = text.apply(lambda x : " ".join(word for word in x.split() if word not in most_freq_words ))
-
-count_words = text.str.findall(r'(\w+)').str.len()
+count_words = text.str.findall(r"(\w+)").str.len()
 
 apostrophe_dict = {
-"ain't": "am not / are not",
-"aren't": "are not / am not",
-"can't": "cannot",
-"can't've": "cannot have",
-"'cause": "because",
-"could've": "could have",
-"couldn't": "could not",
-"couldn't've": "could not have",
-"didn't": "did not",
-"doesn't": "does not",
-"don't": "do not",
-"hadn't": "had not",
-"hadn't've": "had not have",
-"hasn't": "has not",
-"haven't": "have not",
-"he'd": "he had / he would",
-"he'd've": "he would have",
-"he'll": "he shall / he will",
-"he'll've": "he shall have / he will have",
-"he's": "he has / he is",
-"how'd": "how did",
-"how'd'y": "how do you",
-"how'll": "how will",
-"how's": "how has / how is",
-"i'd": "I had / I would",
-"i'd've": "I would have",
-"i'll": "I shall / I will",
-"i'll've": "I shall have / I will have",
-"i'm": "I am",
-"i've": "I have",
-"isn't": "is not",
-"it'd": "it had / it would",
-"it'd've": "it would have",
-"it'll": "it shall / it will",
-"it'll've": "it shall have / it will have",
-"it's": "it has / it is",
-"let's": "let us",
-"ma'am": "madam",
-"mayn't": "may not",
-"might've": "might have",
-"mightn't": "might not",
-"mightn't've": "might not have",
-"must've": "must have",
-"mustn't": "must not",
-"mustn't've": "must not have",
-"needn't": "need not",
-"needn't've": "need not have",
-"o'clock": "of the clock",
-"oughtn't": "ought not",
-"oughtn't've": "ought not have",
-"shan't": "shall not",
-"sha'n't": "shall not",
-"shan't've": "shall not have",
-"she'd": "she had / she would",
-"she'd've": "she would have",
-"she'll": "she shall / she will",
-"she'll've": "she shall have / she will have",
-"she's": "she has / she is",
-"should've": "should have",
-"shouldn't": "should not",
-"shouldn't've": "should not have",
-"so've": "so have",
-"so's": "so as / so is",
-"that'd": "that would / that had",
-"that'd've": "that would have",
-"that's": "that has / that is",
-"there'd": "there had / there would",
-"there'd've": "there would have",
-"there's": "there has / there is",
-"they'd": "they had / they would",
-"they'd've": "they would have",
-"they'll": "they shall / they will",
-"they'll've": "they shall have / they will have",
-"they're": "they are",
-"they've": "they have",
-"to've": "to have",
-"wasn't": "was not",
-"we'd": "we had / we would",
-"we'd've": "we would have",
-"we'll": "we will",
-"we'll've": "we will have",
-"we're": "we are",
-"we've": "we have",
-"weren't": "were not",
-"what'll": "what shall / what will",
-"what'll've": "what shall have / what will have",
-"what're": "what are",
-"what's": "what has / what is",
-"what've": "what have",
-"when's": "when has / when is",
-"when've": "when have",
-"where'd": "where did",
-"where's": "where has / where is",
-"where've": "where have",
-"who'll": "who shall / who will",
-"who'll've": "who shall have / who will have",
-"who's": "who has / who is",
-"who've": "who have",
-"why's": "why has / why is",
-"why've": "why have",
-"will've": "will have",
-"won't": "will not",
-"won't've": "will not have",
-"would've": "would have",
-"wouldn't": "would not",
-"wouldn't've": "would not have",
-"y'all": "you all",
-"y'all'd": "you all would",
-"y'all'd've": "you all would have",
-"y'all're": "you all are",
-"y'all've": "you all have",
-"you'd": "you had / you would",
-"you'd've": "you would have",
-"you'll": "you shall / you will",
-"you'll've": "you shall have / you will have",
-"you're": "you are",
-"you've": "you have"
+    "ain't": "am not / are not",
+    "aren't": "are not / am not",
+    "can't": "cannot",
+    "can't've": "cannot have",
+    "'cause": "because",
+    "could've": "could have",
+    "couldn't": "could not",
+    "couldn't've": "could not have",
+    "didn't": "did not",
+    "doesn't": "does not",
+    "don't": "do not",
+    "hadn't": "had not",
+    "hadn't've": "had not have",
+    "hasn't": "has not",
+    "haven't": "have not",
+    "he'd": "he had / he would",
+    "he'd've": "he would have",
+    "he'll": "he shall / he will",
+    "he'll've": "he shall have / he will have",
+    "he's": "he has / he is",
+    "how'd": "how did",
+    "how'd'y": "how do you",
+    "how'll": "how will",
+    "how's": "how has / how is",
+    "i'd": "I had / I would",
+    "i'd've": "I would have",
+    "i'll": "I shall / I will",
+    "i'll've": "I shall have / I will have",
+    "i'm": "I am",
+    "i've": "I have",
+    "isn't": "is not",
+    "it'd": "it had / it would",
+    "it'd've": "it would have",
+    "it'll": "it shall / it will",
+    "it'll've": "it shall have / it will have",
+    "it's": "it has / it is",
+    "let's": "let us",
+    "ma'am": "madam",
+    "mayn't": "may not",
+    "might've": "might have",
+    "mightn't": "might not",
+    "mightn't've": "might not have",
+    "must've": "must have",
+    "mustn't": "must not",
+    "mustn't've": "must not have",
+    "needn't": "need not",
+    "needn't've": "need not have",
+    "o'clock": "of the clock",
+    "oughtn't": "ought not",
+    "oughtn't've": "ought not have",
+    "shan't": "shall not",
+    "sha'n't": "shall not",
+    "shan't've": "shall not have",
+    "she'd": "she had / she would",
+    "she'd've": "she would have",
+    "she'll": "she shall / she will",
+    "she'll've": "she shall have / she will have",
+    "she's": "she has / she is",
+    "should've": "should have",
+    "shouldn't": "should not",
+    "shouldn't've": "should not have",
+    "so've": "so have",
+    "so's": "so as / so is",
+    "that'd": "that would / that had",
+    "that'd've": "that would have",
+    "that's": "that has / that is",
+    "there'd": "there had / there would",
+    "there'd've": "there would have",
+    "there's": "there has / there is",
+    "they'd": "they had / they would",
+    "they'd've": "they would have",
+    "they'll": "they shall / they will",
+    "they'll've": "they shall have / they will have",
+    "they're": "they are",
+    "they've": "they have",
+    "to've": "to have",
+    "wasn't": "was not",
+    "we'd": "we had / we would",
+    "we'd've": "we would have",
+    "we'll": "we will",
+    "we'll've": "we will have",
+    "we're": "we are",
+    "we've": "we have",
+    "weren't": "were not",
+    "what'll": "what shall / what will",
+    "what'll've": "what shall have / what will have",
+    "what're": "what are",
+    "what's": "what has / what is",
+    "what've": "what have",
+    "when's": "when has / when is",
+    "when've": "when have",
+    "where'd": "where did",
+    "where's": "where has / where is",
+    "where've": "where have",
+    "who'll": "who shall / who will",
+    "who'll've": "who shall have / who will have",
+    "who's": "who has / who is",
+    "who've": "who have",
+    "why's": "why has / why is",
+    "why've": "why have",
+    "will've": "will have",
+    "won't": "will not",
+    "won't've": "will not have",
+    "would've": "would have",
+    "wouldn't": "would not",
+    "wouldn't've": "would not have",
+    "y'all": "you all",
+    "y'all'd": "you all would",
+    "y'all'd've": "you all would have",
+    "y'all're": "you all are",
+    "y'all've": "you all have",
+    "you'd": "you had / you would",
+    "you'd've": "you would have",
+    "you'll": "you shall / you will",
+    "you'll've": "you shall have / you will have",
+    "you're": "you are",
+    "you've": "you have",
 }
 
 
@@ -184,23 +176,23 @@ def lookup_dict(txt, dictionary):
     return txt
 
 
-text = text.apply(lambda x: lookup_dict(x,apostrophe_dict))
+text = text.apply(lambda x: lookup_dict(x, apostrophe_dict))
 
 # Remove rare words
 from collections import Counter
 from itertools import chain
 
 # split words into lists
-v = text.str.split().tolist() 
+v = text.str.split().tolist()
 # compute global word frequency
 c = Counter(chain.from_iterable(v))
 # filter, join, and re-assign
-text = [' '.join([j for j in i if c[j] > 1]) for i in v]
+text = [" ".join([j for j in i if c[j] > 1]) for i in v]
 text = pd.Series(text)
 
 total_word = 0
-for x,word in enumerate(text):
+for x, word in enumerate(text):
     num_word = len(word.split())
-    #print(num_word)
+    # print(num_word)
     total_word = total_word + num_word
-print(total_word)
\ No newline at end of file
+print(total_word)
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
index f8c410bb..e43c6fc3 100644
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
@@ -1,15 +1,16 @@
-from sklearn.feature_extraction.text import TfidfVectorizer
-import pandas as pd
 import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+
 
 class TfidfFeature:
     def fit(self, train_df: pd.DataFrame):
         train_df = np.array(train_df).tolist()
-        train_X = list(map(''.join, train_df))
-        self.model = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=0.01).fit(train_X)
+        train_X = list(map("".join, train_df))
+        self.model = TfidfVectorizer(stop_words="english", max_df=0.5, min_df=0.01).fit(train_X)
         # print(self.model.get_feature_names_out()[:5])
 
     def transform(self, X: pd.DataFrame):
         X = np.array(X).tolist()
-        X = list(map(''.join, X))
+        X = list(map("".join, X))
         return self.model.transform(X)
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
index 487e3a5b..f0d15b3c 100644
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
@@ -2,14 +2,17 @@
 from sklearn.multioutput import MultiOutputRegressor
 from sklearn.svm import SVR
 
+
 def select(X: pd.DataFrame) -> pd.DataFrame:
     return X
 
+
 def fit(X_train: pd.DataFrame, y_train: pd.Series):
     model = MultiOutputRegressor(SVR())
     model.fit(X_train, y_train)
     return model
 
+
 def predict(model: MultiOutputRegressor, X_test: pd.DataFrame):
     X_test_selected = select(X_test)
     return model.predict(X_test_selected)
diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
index 300f1ae7..29d957cb 100644
--- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
+++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
@@ -3,11 +3,14 @@
 import importlib.util
 from pathlib import Path
 
+
 def import_module_from_path(module_name, module_path):
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module
+
+
 DIRNAME = Path(__file__).absolute().resolve().parent
 
 y = target
@@ -24,11 +27,11 @@ def import_module_from_path(module_name, module_path):
     X_test_l.append(X_test_f)
 
 
-submission['cohesion'] = predictions[:,0]
-submission['syntax'] = predictions[:,1]
-submission['vocabulary'] = predictions[:,2]
-submission['phraseology'] = predictions[:,3]
-submission['grammar'] = predictions[:,4]
-submission['conventions'] = predictions[:,5]
+submission["cohesion"] = predictions[:, 0]
+submission["syntax"] = predictions[:, 1]
+submission["vocabulary"] = predictions[:, 2]
+submission["phraseology"] = predictions[:, 3]
+submission["grammar"] = predictions[:, 4]
+submission["conventions"] = predictions[:, 5]
 
-submission.to_csv('submission.csv',index=False) # writing data to a CSV file
\ No newline at end of file
+submission.to_csv("submission.csv", index=False)  # writing data to a CSV file
diff --git a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
index df743a69..b905bf0d 100644
--- a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
+++ b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 
+from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
 from rdagent.components.coder.factor_coder.factor import (
     FactorFBWorkspace,
     FactorTask,
@@ -11,15 +12,19 @@
     ModelTask,
 )
 from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace
-from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
+
 
 class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template")
+        self.experiment_workspace = KGFBWorkspace(
+            template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
+        )
 
 
 class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]):
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template")
+        self.experiment_workspace = KGFBWorkspace(
+            template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
+        )
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py
index cff02620..79975ba7 100644
--- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py
@@ -54,9 +54,9 @@ def import_module_from_path(module_name, module_path):
     X_valid_l.append(X_valid_f)
     X_test_l.append(X_test_f)
 
-X_train = pd.concat(X_train_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_train_l))])
-X_valid = pd.concat(X_valid_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_valid_l))])
-X_test = pd.concat(X_test_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_test_l))])
+X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
+X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
+X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
 
 print(X_train.shape, X_valid.shape, X_test.shape)
 
diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
index 0e602262..e3bb6bb3 100644
--- a/rdagent/scenarios/kaggle/experiment/scenario.py
+++ b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -76,7 +76,9 @@ def _analysis_competition_description(self):
     def background(self) -> str:
         background_template = prompt_dict["kg_background"]
 
-        train_script = (Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py").read_text()
+        train_script = (
+            Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py"
+        ).read_text()
 
         background_prompt = (
             Environment(undefined=StrictUndefined)
diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
index 759748b1..537a3cab 100644
--- a/rdagent/scenarios/kaggle/experiment/workspace.py
+++ b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -71,7 +71,9 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
             entry=f"python train.py",
             env=run_env,
             running_extra_volume=(
-                {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} if KAGGLE_IMPLEMENT_SETTING.competition else None
+                {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
+                if KAGGLE_IMPLEMENT_SETTING.competition
+                else None
             ),
         )
 
diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py
index d74e2faf..3cb634f5 100644
--- a/rdagent/scenarios/kaggle/kaggle_crawler.py
+++ b/rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -1,16 +1,15 @@
 import json
-import time
 import subprocess
+import time
 import zipfile
-
 from pathlib import Path
 
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 
-from rdagent.log import rdagent_logger as logger
 from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
+from rdagent.log import rdagent_logger as logger
 
 options = webdriver.ChromeOptions()
 options.add_argument("--no-sandbox")
diff --git a/test/utils/test_kaggle.py b/test/utils/test_kaggle.py
index 8419c361..bd6e2693 100644
--- a/test/utils/test_kaggle.py
+++ b/test/utils/test_kaggle.py
@@ -1,24 +1,31 @@
 import unittest
-import nbformat
 from pathlib import Path
 
+import nbformat
+from rich import print
+
+from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
 from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace
 from rdagent.utils.agent.ret import PythonAgentOut
 from rdagent.utils.agent.tpl import T
 
-from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
-from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace
-from rich import print
+
 class TestTpl(unittest.TestCase):
     def test_competition_template(self):
         competition = KAGGLE_IMPLEMENT_SETTING.competition
         print(f"[bold orange]{competition}[/bold orange]")
-        ws = KGFBWorkspace(template_folder_path=Path(__file__).parent.parent.parent / "rdagent/scenarios/kaggle/experiment" / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template")
+        ws = KGFBWorkspace(
+            template_folder_path=Path(__file__).parent.parent.parent
+            / "rdagent/scenarios/kaggle/experiment"
+            / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
+        )
         print(ws.workspace_path)
         ws.execute()
         success = (ws.workspace_path / "submission.csv").exists()
         ws.clear()
-        return success
+        self.assertTrue(success, "submission.csv was not generated in the workspace")  # unittest ignores return values
 
+
 if __name__ == "__main__":
     unittest.main()

From c74ef8c8f4934aca114ab13c42039ab09ce42d14 Mon Sep 17 00:00:00 2001
From: Bowen Xian <xianbowen@outlook.com>
Date: Fri, 20 Sep 2024 12:45:25 +0000
Subject: [PATCH 4/4] fix CI

---
 rdagent/app/kaggle/loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
index e0261d29..b250c343 100644
--- a/rdagent/app/kaggle/loop.py
+++ b/rdagent/app/kaggle/loop.py
@@ -17,8 +17,8 @@
 from rdagent.core.scenario import Scenario
 from rdagent.core.utils import import_class
 from rdagent.log import rdagent_logger as logger
-from rdagent.scenarios.kaggle.kaggle_crawler import download_data
 from rdagent.log.time import measure_time
+from rdagent.scenarios.kaggle.kaggle_crawler import download_data
 from rdagent.scenarios.kaggle.proposal.proposal import (
     KG_ACTION_FEATURE_ENGINEERING,
     KG_ACTION_FEATURE_PROCESSING,