From cde7c1f2e32172baf9c9577379061801d9fa3f34 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Thu, 19 Sep 2024 09:25:54 +0000 Subject: [PATCH 1/4] add kaggle test --- test/utils/test_kaggle.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 test/utils/test_kaggle.py diff --git a/test/utils/test_kaggle.py b/test/utils/test_kaggle.py new file mode 100644 index 00000000..a434470d --- /dev/null +++ b/test/utils/test_kaggle.py @@ -0,0 +1,17 @@ +import unittest +import nbformat + + +from rdagent.oai.llm_utils import APIBackend +from rdagent.utils.agent.ret import PythonAgentOut +from rdagent.utils.agent.tpl import T + + +class TestTplGen(unittest.TestCase): + def generate(self, competition: str = "feedback-prize-english-language-learning"): + + print(competition) + + +if __name__ == "__main__": + unittest.main() From a7f12f2a2eff7b8c74539fe970e9fb88fc5a419d Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 20 Sep 2024 12:40:32 +0000 Subject: [PATCH 2/4] kaggle templates changes --- rdagent/app/kaggle/conf.py | 2 + rdagent/app/kaggle/loop.py | 6 +- rdagent/scenarios/kaggle/developer/runner.py | 2 - .../fea_share_preprocess.py | 206 ++++++++++++++++++ .../feature/feature.py | 15 ++ .../model/model.py | 15 ++ .../train.py | 34 +++ .../kaggle/experiment/kaggle_experiment.py | 6 +- .../cross_validation_tpl.py | 87 ++++++++ .../fea_share_preprocess.py | 111 ++++++++++ .../feature/feature.py | 23 ++ .../model/model_rf.py | 54 +++++ .../model/model_xgb.py | 40 ++++ .../playground-series-s4e8_template/train.py | 121 ++++++++++ .../scenarios/kaggle/experiment/scenario.py | 2 +- .../scenarios/kaggle/experiment/workspace.py | 6 +- rdagent/scenarios/kaggle/kaggle_crawler.py | 21 +- rdagent/utils/env.py | 36 +-- test/utils/test_kaggle.py | 21 +- 19 files changed, 758 insertions(+), 50 deletions(-) create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py create mode 100644 rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/feature/feature.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_rf.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgb.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py index ea5594c1..b1f30fbd 100644 --- a/rdagent/app/kaggle/conf.py +++ b/rdagent/app/kaggle/conf.py @@ -42,6 +42,8 @@ class Config: competition: str = "" + local_data_path: str = "/data/userdata/share/kaggle" + rag_path: str = "git_ignore_folder/rag" diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index e4fac885..afb98fb8 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -2,7 +2,7 @@ from typing import Any import fire - +from rdagent.scenarios.kaggle.kaggle_crawler import download_data from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING from rdagent.components.workflow.conf import BasePropSetting from rdagent.components.workflow.rd_loop import RDLoop @@ -88,6 +88,10 @@ def main(path=None, step_n=None, competition=None): """ if competition: KAGGLE_IMPLEMENT_SETTING.competition = competition + download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path) + else: + logger.error("Please specify competition name.") + if path is None: model_loop = ModelRDLoop(KAGGLE_IMPLEMENT_SETTING) else: diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py index 7fa6242a..43f867eb 100644 --- a/rdagent/scenarios/kaggle/developer/runner.py +++ b/rdagent/scenarios/kaggle/developer/runner.py @@ -15,8 +15,6 @@ KGModelExperiment, ) -META_TPL_DIR = Path(__file__).parent.parent / "experiment" / "meta_tpl" - class KGCachedRunner(CachedRunner[ASpecificExp]): def build_from_SOTA(self, exp: ASpecificExp) -> None: diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py new file mode 100644 index 00000000..a4c0783f --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py @@ -0,0 +1,206 @@ +# TODO: Fix +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) +import re + + +train = pd.read_csv("/kaggle/input/train.csv") +test = pd.read_csv("/kaggle/input/test.csv") +submission = pd.read_csv("/kaggle/input/sample_submission.csv") + + + +features = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'] +target = train[features] + + + + +text_train = train['full_text'] +text_test = test['full_text'] + +text = pd.concat([text_train, text_test], ignore_index=True) + + + +count_words = text.str.findall(r'(\w+)').str.len() +print(count_words.sum()) + + + +""" Cleaning Text """ +text = text.str.lower() + +# removing special characters and numbers +text = text.apply(lambda x : re.sub("[^a-z]\s","",x) ) + +# remove hash tags +text = text.str.replace("#", "") + +#remove words less than 3 character and greater than 7 +text = text.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2 and len(w)<8])) + +# removing stopwords +#text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords )) + +count_words = text.str.findall(r'(\w+)').str.len() +print(count_words.sum()) + + + + + +most_freq_words = pd.Series(' '.join(text).lower().split()).value_counts()[:25] +text = text.apply(lambda x : " ".join(word for word in x.split() if word not in most_freq_words )) + +count_words = text.str.findall(r'(\w+)').str.len() + +apostrophe_dict = { +"ain't": "am not / are not", +"aren't": "are not / am not", +"can't": "cannot", +"can't've": "cannot have", +"'cause": "because", +"could've": "could have", +"couldn't": "could not", +"couldn't've": "could not have", +"didn't": "did not", +"doesn't": "does not", +"don't": "do not", +"hadn't": "had not", +"hadn't've": "had not have", +"hasn't": "has not", +"haven't": "have not", +"he'd": "he had / he would", +"he'd've": "he would have", +"he'll": "he shall / he will", +"he'll've": "he shall have / he will have", +"he's": "he has / he is", +"how'd": "how did", +"how'd'y": "how do you", +"how'll": "how will", +"how's": "how has / how is", +"i'd": "I had / I would", +"i'd've": "I would have", +"i'll": "I shall / I will", +"i'll've": "I shall have / I will have", +"i'm": "I am", +"i've": "I have", +"isn't": "is not", +"it'd": "it had / it would", +"it'd've": "it would have", +"it'll": "it shall / it will", +"it'll've": "it shall have / it will have", +"it's": "it has / it is", +"let's": "let us", +"ma'am": "madam", +"mayn't": "may not", +"might've": "might have", +"mightn't": "might not", +"mightn't've": "might not have", +"must've": "must have", +"mustn't": "must not", +"mustn't've": "must not have", +"needn't": "need not", +"needn't've": "need not have", +"o'clock": "of the clock", +"oughtn't": "ought not", +"oughtn't've": "ought not have", +"shan't": "shall not", +"sha'n't": "shall not", +"shan't've": "shall not have", +"she'd": "she had / she would", +"she'd've": "she would have", +"she'll": "she shall / she will", +"she'll've": "she shall have / she will have", +"she's": "she has / she is", +"should've": "should have", +"shouldn't": "should not", +"shouldn't've": "should not have", +"so've": "so have", +"so's": "so as / so is", +"that'd": "that would / that had", +"that'd've": "that would have", +"that's": "that has / that is", +"there'd": "there had / there would", +"there'd've": "there would have", +"there's": "there has / there is", +"they'd": "they had / they would", +"they'd've": "they would have", +"they'll": "they shall / they will", +"they'll've": "they shall have / they will have", +"they're": "they are", +"they've": "they have", +"to've": "to have", +"wasn't": "was not", +"we'd": "we had / we would", +"we'd've": "we would have", +"we'll": "we will", +"we'll've": "we will have", +"we're": "we are", +"we've": "we have", +"weren't": "were not", +"what'll": "what shall / what will", +"what'll've": "what shall have / what will have", +"what're": "what are", +"what's": "what has / what is", +"what've": "what have", +"when's": "when has / when is", +"when've": "when have", +"where'd": "where did", +"where's": "where has / where is", +"where've": "where have", +"who'll": "who shall / who will", +"who'll've": "who shall have / who will have", +"who's": "who has / who is", +"who've": "who have", +"why's": "why has / why is", +"why've": "why have", +"will've": "will have", +"won't": "will not", +"won't've": "will not have", +"would've": "would have", +"wouldn't": "would not", +"wouldn't've": "would not have", +"y'all": "you all", +"y'all'd": "you all would", +"y'all'd've": "you all would have", +"y'all're": "you all are", +"y'all've": "you all have", +"you'd": "you had / you would", +"you'd've": "you would have", +"you'll": "you shall / you will", +"you'll've": "you shall have / you will have", +"you're": "you are", +"you've": "you have" +} + + +def lookup_dict(txt, dictionary): + for word in txt.split(): + if word.lower() in dictionary: + if word.lower() in txt.split(): + txt = txt.replace(word, dictionary[word.lower()]) + return txt + + +text = text.apply(lambda x: lookup_dict(x,apostrophe_dict)) + +# Remove rare words +from collections import Counter +from itertools import chain + +# split words into lists +v = text.str.split().tolist() +# compute global word frequency +c = Counter(chain.from_iterable(v)) +# filter, join, and re-assign +text = [' '.join([j for j in i if c[j] > 1]) for i in v] +text = pd.Series(text) + +total_word = 0 +for x,word in enumerate(text): + num_word = len(word.split()) + #print(num_word) + total_word = total_word + num_word +print(total_word) \ No newline at end of file diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py new file mode 100644 index 00000000..f8c410bb --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py @@ -0,0 +1,15 @@ +from sklearn.feature_extraction.text import TfidfVectorizer +import pandas as pd +import numpy as np + +class TfidfFeature: + def fit(self, train_df: pd.DataFrame): + train_df = np.array(train_df).tolist() + train_X = list(map(''.join, train_df)) + self.model = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=0.01).fit(train_X) + # print(self.model.get_feature_names_out()[:5]) + + def transform(self, X: pd.DataFrame): + X = np.array(X).tolist() + X = list(map(''.join, X)) + return self.model.transform(X) diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py new file mode 100644 index 00000000..487e3a5b --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py @@ -0,0 +1,15 @@ +import pandas as pd +from sklearn.multioutput import MultiOutputRegressor +from sklearn.svm import SVR + +def select(X: pd.DataFrame) -> pd.DataFrame: + return X + +def fit(X_train: pd.DataFrame, y_train: pd.Series): + model = MultiOutputRegressor(SVR()) + model.fit(X_train, y_train) + return model + +def predict(model: MultiOutputRegressor, X_test: pd.DataFrame): + X_test_selected = select(X_test) + return model.predict(X_test_selected) diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py new file mode 100644 index 00000000..300f1ae7 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py @@ -0,0 +1,34 @@ +# TODO: fix the train.py + +import importlib.util +from pathlib import Path + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module +DIRNAME = Path(__file__).absolute().resolve().parent + +y = target +X = text[: len(train)] +X_test = text[len(train) :] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_test_f = cls.transform(X_test) + + X_train_l.append(X_train_f) + X_test_l.append(X_test_f) + + +submission['cohesion'] = predictions[:,0] +submission['syntax'] = predictions[:,1] +submission['vocabulary'] = predictions[:,2] +submission['phraseology'] = predictions[:,3] +submission['grammar'] = predictions[:,4] +submission['conventions'] = predictions[:,5] + +submission.to_csv('submission.csv',index=False) # writing data to a CSV file \ No newline at end of file diff --git a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py index 99602006..df743a69 100644 --- a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py +++ b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py @@ -11,15 +11,15 @@ ModelTask, ) from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace - +from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / "meta_tpl") + self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template") class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / "meta_tpl") + self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template") diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py new file mode 100644 index 00000000..90ec0c2a --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py @@ -0,0 +1,87 @@ +from pathlib import Path + +import numpy as np +import pandas as pd +from sklearn.model_selection import KFold +from sklearn.preprocessing import LabelEncoder + +from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess + + +def compute_metrics_for_classification(y_true, y_pred): + """Compute MCC for classification.""" + from sklearn.metrics import matthews_corrcoef + + return matthews_corrcoef(y_true, y_pred) + + +def perform_kfold_cross_validation(X, y, n_splits=2, random_seed=42): + kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed) + fold_metrics = [] + + DIRNAME = Path(__file__).absolute().resolve().parent + + for fold, (train_idx, valid_idx) in enumerate(kf.split(X)): + X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx] + y_train_fold, y_valid_fold = y[train_idx], y[valid_idx] + + # TODO: Preprocess and Feature Engineering before K-Fold CV + + # Preprocess the data + X_train_fold = preprocess(X_train_fold) + X_valid_fold = preprocess(X_valid_fold) + + # Feature Engineering + X_train_l_fold, X_valid_l_fold = [], [] + for f in DIRNAME.glob("feat*.py"): + m = __import__(f.name.strip(".py")) + X_train_fold = m.feat_eng(X_train_fold) + X_valid_fold = m.feat_eng(X_valid_fold) + + X_train_l_fold.append(X_train_fold) + X_valid_l_fold.append(X_valid_fold) + + X_train_fold = pd.concat(X_train_l_fold, axis=1) + X_valid_fold = pd.concat(X_valid_l_fold, axis=1) + + # Align features + X_valid_fold = X_valid_fold.reindex(columns=X_train_fold.columns, fill_value=0) + + # Train and evaluate models + mcc_scores = [] + model_l = [] # Reinitialize model list + for f in DIRNAME.glob("model*.py"): + m = __import__(f.name.strip(".py")) + model = m.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold) + y_valid_pred = m.predict(model, X_valid_fold) + mcc = compute_metrics_for_classification(y_valid_fold, y_valid_pred) + mcc_scores.append(mcc) + print(f"Fold {fold+1}, Model {f.name}: MCC = {mcc}") + + # Store the average MCC score for this fold + avg_mcc = np.mean(mcc_scores) + fold_metrics.append(avg_mcc) + print(f"Fold {fold+1} average MCC: {avg_mcc}") + + # Calculate the overall average MCC + overall_avg_mcc = np.mean(fold_metrics) + result_df = pd.DataFrame({"Overall Average MCC": [overall_avg_mcc]}) + result_df.to_csv(f"path/to/playground-series-s4e8/cv_score_{f.name.strip('.py')}.csv", index=False) + + print(f"Overall Average MCC across all folds: {overall_avg_mcc}") + return overall_avg_mcc + + +# This allows the script to be run directly +if __name__ == "__main__": + # Load and preprocess the data + data_df = pd.read_csv("path/to/playground-series-s4e8/train.csv") + data_df = data_df.drop(["id"], axis=1) + + X = data_df.drop(["class"], axis=1) + y = data_df[["class"]] + + label_encoder = LabelEncoder() + # transfrom y to 1D + y = label_encoder.fit_transform(y) + result = perform_kfold_cross_validation(X, y) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py new file mode 100644 index 00000000..4b4ef273 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py @@ -0,0 +1,111 @@ +import os + +import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import LabelEncoder, OneHotEncoder + + +def prepreprocess(): + """ + This method loads the data, drops the unnecessary columns, and splits it into train and validation sets. + """ + # Load and preprocess the data + data_df = pd.read_csv("/kaggle/input/train.csv") + data_df = data_df.drop(["id"], axis=1) + + X = data_df.drop(["class"], axis=1) + y = data_df[["class"]] + + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) # Convert class labels to numeric + + # Split the data into training and validation sets + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=42) + + return X_train, X_valid, y_train, y_valid + + +def preprocess_fit(X_train: pd.DataFrame): + """ + Fits the preprocessor on the training data and returns the fitted preprocessor. + """ + # Identify numerical and categorical features + numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ["int64", "float64"]] + categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"] + + # Define preprocessors for numerical and categorical features + categorical_transformer = Pipeline( + steps=[ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("onehot", OneHotEncoder(handle_unknown="ignore")), + ] + ) + + numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))]) + + # Combine preprocessing steps + preprocessor = ColumnTransformer( + transformers=[ + ("cat", categorical_transformer, categorical_cols), + ("num", numerical_transformer, numerical_cols), + ] + ) + + # Fit the preprocessor on the training data + preprocessor.fit(X_train) + + return preprocessor + + +def preprocess_transform(X: pd.DataFrame, preprocessor): + """ + Transforms the given DataFrame using the fitted preprocessor. + Ensures the processed data has consistent features across train, validation, and test sets. + """ + # Transform the data using the fitted preprocessor + X_array = preprocessor.transform(X).toarray() + + # Get feature names for the columns in the transformed data + categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"] + feature_names = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out( + categorical_cols + ).tolist() + [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]] + + # Convert arrays back to DataFrames + X_transformed = pd.DataFrame(X_array, columns=feature_names, index=X.index) + + return X_transformed + + +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. + """ + if os.path.exists("X_train.pkl"): + X_train = pd.read_pickle("X_train.pkl") + X_valid = pd.read_pickle("X_valid.pkl") + y_train = pd.read_pickle("y_train.pkl") + y_valid = pd.read_pickle("y_valid.pkl") + X_test = pd.read_pickle("X_test.pkl") + passenger_ids = pd.read_pickle("passenger_ids.pkl") + + return X_train, X_valid, y_train, y_valid, X_test, passenger_ids + X_train, X_valid, y_train, y_valid = prepreprocess() + + # Fit the preprocessor on the training data + preprocessor = preprocess_fit(X_train) + + # Preprocess the train, validation, and test data + X_train = preprocess_transform(X_train, preprocessor) + X_valid = preprocess_transform(X_valid, preprocessor) + + # Load and preprocess the test data + submission_df = pd.read_csv("/kaggle/input/test.csv") + passenger_ids = submission_df["id"] + submission_df = submission_df.drop(["id"], axis=1) + X_test = preprocess_transform(submission_df, preprocessor) + + return X_train, X_valid, y_train, y_valid, X_test, passenger_ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_rf.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_rf.py new file mode 100644 index 00000000..3c64a094 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_rf.py @@ -0,0 +1,54 @@ +""" +Motivation of the model: +The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality. +It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good +baseline model for many classification tasks. +""" + +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + return X + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """ + Define and train the Random Forest model. Merge feature selection into the pipeline. + """ + # Initialize the Random Forest model + model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1) + + # Select features (if any feature selection is needed) + X_train_selected = select(X_train) + X_valid_selected = select(X_valid) + + # Fit the model + model.fit(X_train_selected, y_train) + + # Validate the model + y_valid_pred = model.predict(X_valid_selected) + accuracy = accuracy_score(y_valid, y_valid_pred) + print(f"Validation Accuracy: {accuracy:.4f}") + + return model + + +def predict(model, X): + """ + Keep feature selection's consistency and make predictions. + """ + # Select features (if any feature selection is needed) + X_selected = select(X) + + # Predict using the trained model + y_pred_prob = model.predict_proba(X_selected)[:, 1] + + # Apply threshold to get boolean predictions + return y_pred_prob diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgb.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgb.py new file mode 100644 index 00000000..56b81c9a --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgb.py @@ -0,0 +1,40 @@ +""" +motivation of the model +""" + +import pandas as pd +import xgboost as xgb + + +def select(X: pd.DataFrame) -> pd.DataFrame: + # Ignore feature selection logic + return X + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. Merge feature_select""" + X_train = select(X_train) + X_valid = select(X_valid) + dtrain = xgb.DMatrix(X_train, label=y_train) + dvalid = xgb.DMatrix(X_valid, label=y_valid) + + # TODO: for quick running.... + params = { + "nthred": -1, + } + num_round = 200 + + evallist = [(dtrain, "train"), (dvalid, "eval")] + bst = xgb.train(params, dtrain, num_round, evallist) + + return bst + + +def predict(model, X): + """ + Keep feature select's consistency. + """ + X = select(X) + dtest = xgb.DMatrix(X) + y_pred_prob = model.predict(dtest) + return y_pred_prob diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py new file mode 100644 index 00000000..cff02620 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py @@ -0,0 +1,121 @@ +import importlib.util +import random +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import accuracy_score, matthews_corrcoef +from sklearn.preprocessing import LabelEncoder + +# Set random seed for reproducibility +SEED = 42 +random.seed(SEED) +np.random.seed(SEED) +DIRNAME = Path(__file__).absolute().resolve().parent + + +# support various method for metrics calculation +def compute_metrics_for_classification(y_true, y_pred): + """Compute accuracy metric for classification.""" + accuracy = accuracy_score(y_true, y_pred) + return accuracy + + +def compute_metrics_for_classification(y_true, y_pred): + """Compute MCC for classification.""" + mcc = matthews_corrcoef(y_true, y_pred) + return mcc + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +# TODO 如果已经做过数据预处理了,不需要再做了 +X_train, X_valid, y_train, y_valid, X_test, passenger_ids = preprocess_script() + +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) + + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + X_test_l.append(X_test_f) + +X_train = pd.concat(X_train_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_train_l))]) +X_valid = pd.concat(X_valid_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_valid_l))]) +X_test = pd.concat(X_test_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_test_l))]) + +print(X_train.shape, X_valid.shape, X_test.shape) + +# Handle inf and -inf values +X_train.replace([np.inf, -np.inf], np.nan, inplace=True) +X_valid.replace([np.inf, -np.inf], np.nan, inplace=True) +X_test.replace([np.inf, -np.inf], np.nan, inplace=True) + +from sklearn.impute import SimpleImputer + +imputer = SimpleImputer(strategy="mean") + +X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns) +X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns) +X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns) + +# Remove duplicate columns +X_train = X_train.loc[:, ~X_train.columns.duplicated()] +X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()] +X_test = X_test.loc[:, ~X_test.columns.duplicated()] + +# 3) Train the model +model_l = [] # list[tuple[model, predict_func,]] +for f in DIRNAME.glob("model/model*.py"): + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict)) + +# 4) Evaluate the model on the validation set +y_valid_pred_l = [] +for model, predict_func in model_l: + y_valid_pred_l.append(predict_func(model, X_valid)) + +# 5) Ensemble +# TODO: ensemble method in a script +# Average the predictions and apply a threshold to determine class labels +y_valid_pred = np.mean(y_valid_pred_l, axis=0) +y_valid_pred = (y_valid_pred > 0.5).astype(int) + +mcc = compute_metrics_for_classification(y_valid, y_valid_pred) +print("Final on validation set: ", mcc) + +# 6) Save the validation accuracy +pd.Series(data=[mcc], index=["MCC"]).to_csv("submission_score.csv") + +# 7) Make predictions on the test set and save them +label_encoder = LabelEncoder() +label_encoder.fit(y_train) +y_test_pred_bool_l = [] +for m, m_pred in model_l: + y_test_pred_bool_l.append( + m_pred(m, X_test).astype(int) + ) # TODO Make this an ensemble. Currently it uses the last prediction + +y_test_pred = np.mean(y_test_pred_bool_l, axis=0) +y_test_pred = (y_test_pred > 0.5).astype(int) + +y_test_pred_labels = label_encoder.inverse_transform(y_test_pred) # 将整数转换回 'e' 或 'p' + +submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels}) + +# 8) Submit predictions for the test set +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index cebae0e2..0e602262 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -76,7 +76,7 @@ def _analysis_competition_description(self): def background(self) -> str: background_template = prompt_dict["kg_background"] - train_script = (Path(__file__).parent / "meta_tpl" / "train.py").read_text() + train_script = (Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py").read_text() background_prompt = ( Environment(undefined=StrictUndefined) diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py index 000fa3d9..759748b1 100644 --- a/rdagent/scenarios/kaggle/experiment/workspace.py +++ b/rdagent/scenarios/kaggle/experiment/workspace.py @@ -5,7 +5,6 @@ import pandas as pd from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING -from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS from rdagent.core.experiment import FBWorkspace from rdagent.log import rdagent_logger as logger from rdagent.utils.env import KGDockerEnv @@ -61,7 +60,7 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> str: logger.info(f"Running the experiment in {self.workspace_path}") # link the data to the workspace to speed up the preprocessing - source_data_path = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition + source_data_path = Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / KAGGLE_IMPLEMENT_SETTING.competition self.link_all_files_in_folder_to_workspace(source_data_path, self.workspace_path) kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition) @@ -71,6 +70,9 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> str: local_path=str(self.workspace_path), entry=f"python train.py", env=run_env, + running_extra_volume=( + {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} if KAGGLE_IMPLEMENT_SETTING.competition else None + ), ) csv_path = self.workspace_path / "submission_score.csv" diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index 828f65de..d74e2faf 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -1,5 +1,8 @@ import json import time +import subprocess +import zipfile + from pathlib import Path from selenium import webdriver @@ -7,7 +10,7 @@ from selenium.webdriver.common.by import By from rdagent.log import rdagent_logger as logger -from rdagent.utils.env import KGDockerConf +from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING options = webdriver.ChromeOptions() options.add_argument("--no-sandbox") @@ -18,7 +21,7 @@ def crawl_descriptions(competition: str, wait: float = 3.0, force: bool = False) -> dict[str, str]: - if (fp := Path(f"{KGDockerConf().local_data_path}/{competition}.json")).exists() and not force: + if (fp := Path(f"{KAGGLE_IMPLEMENT_SETTING.local_data_path}/{competition}.json")).exists() and not force: logger.info(f"Found {competition}.json, loading from local file.") with fp.open("r") as f: return json.load(f) @@ -62,12 +65,24 @@ def crawl_descriptions(competition: str, wait: float = 3.0, force: bool = False) descriptions["Data Description"] = data_element.get_attribute("innerHTML") driver.quit() - with open(f"{KGDockerConf().dockerfile_folder_path}/{competition}.json", "w") as f: + with open(f"{KAGGLE_IMPLEMENT_SETTING.local_data_path}/{competition}.json", "w") as f: json.dump(descriptions, f) return descriptions +def download_data(competition: str, local_path: str = "/data/userdata/share/kaggle") -> None: + data_path = f"{local_path}/{competition}" + if not Path(data_path).exists(): + subprocess.run(["kaggle", "competitions", "download", "-c", competition, "-p", data_path]) + + # unzip data + with zipfile.ZipFile(f"{data_path}/{competition}.zip", "r") as zip_ref: + zip_ref.extractall(data_path) + + if __name__ == "__main__": + download_data("feedback-prize-english-language-learning", "/data/userdata/share/kaggle") + exit() from kaggle.api.kaggle_api_extended import KaggleApi api = KaggleApi() diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py index 36fbe044..5e2524fd 100644 --- a/rdagent/utils/env.py +++ b/rdagent/utils/env.py @@ -176,12 +176,12 @@ class Config: # image: str = "gcr.io/kaggle-gpu-images/python:latest" mount_path: str = "/workspace/kg_workspace/" default_entry: str = "python train.py" - extra_volumes: dict = { - # TODO connect to the place where the data is stored - Path("git_ignore_folder/data").resolve(): "/root/.data/" - } + # extra_volumes: dict = { + # # TODO connect to the place where the data is stored + # Path("git_ignore_folder/data").resolve(): "/root/.data/" + # } - local_data_path: str = "/data/userdata/share/kaggle" + # local_data_path: str = "/data/userdata/share/kaggle" # physionet.org/files/mimic-eicu-fiddle-feature/1.0.0/FIDDLE_mimic3 @@ -387,29 +387,3 @@ class KGDockerEnv(DockerEnv): def __init__(self, competition: str = None, conf: DockerConf = KGDockerConf()): super().__init__(conf) - self.competition = competition - - def prepare(self): - """ - Download image & data if it doesn't exist - """ - super().prepare() - - # download data, if competition is not provided, the user is targeting a general docker environment in kaggle - data_path = f"{self.conf.local_data_path}/{self.competition}" - if self.competition is not None and not Path(data_path).exists(): - subprocess.run(["kaggle", "competitions", "download", "-c", self.competition, "-p", data_path]) - - # unzip data - with zipfile.ZipFile(f"{data_path}/{self.competition}.zip", "r") as zip_ref: - zip_ref.extractall(data_path) - - def run(self, entry: str | None = None, local_path: str | None = None, env: dict | None = None): - super().run( - entry=entry, - local_path=local_path, - env=env, - running_extra_volume=( - {self.conf.local_data_path + "/" + self.competition: "/kaggle/input"} if self.competition else None - ), - ) diff --git a/test/utils/test_kaggle.py b/test/utils/test_kaggle.py index a434470d..8419c361 100644 --- a/test/utils/test_kaggle.py +++ b/test/utils/test_kaggle.py @@ -1,17 +1,24 @@ import unittest import nbformat - +from pathlib import Path from rdagent.oai.llm_utils import APIBackend from rdagent.utils.agent.ret import PythonAgentOut from rdagent.utils.agent.tpl import T - -class TestTplGen(unittest.TestCase): - def generate(self, competition: str = "feedback-prize-english-language-learning"): - - print(competition) - +from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace +from rich import print +class TestTpl(unittest.TestCase): + def test_competition_template(self): + competition = KAGGLE_IMPLEMENT_SETTING.competition + print(f"[bold orange]{competition}[/bold orange]") + ws = KGFBWorkspace(template_folder_path=Path(__file__).parent.parent.parent / "rdagent/scenarios/kaggle/experiment" / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template") + print(ws.workspace_path) + ws.execute() + success = (ws.workspace_path / "submission.csv").exists() + ws.clear() + return success if __name__ == "__main__": unittest.main() From b145d1969c3ddfcad1be031dee83e4e72d8a04f6 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 20 Sep 2024 12:41:54 +0000 Subject: [PATCH 3/4] CI --- rdagent/app/kaggle/loop.py | 3 +- .../fea_share_preprocess.py | 282 +++++++++--------- .../feature/feature.py | 11 +- .../model/model.py | 3 + .../train.py | 17 +- .../kaggle/experiment/kaggle_experiment.py | 11 +- .../playground-series-s4e8_template/train.py | 6 +- .../scenarios/kaggle/experiment/scenario.py | 4 +- .../scenarios/kaggle/experiment/workspace.py | 4 +- rdagent/scenarios/kaggle/kaggle_crawler.py | 5 +- test/utils/test_kaggle.py | 17 +- 11 files changed, 189 insertions(+), 174 deletions(-) diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index afb98fb8..c75b4601 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -2,7 +2,7 @@ from typing import Any import fire -from rdagent.scenarios.kaggle.kaggle_crawler import download_data + from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING from rdagent.components.workflow.conf import BasePropSetting from rdagent.components.workflow.rd_loop import RDLoop @@ -17,6 +17,7 @@ from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger +from rdagent.scenarios.kaggle.kaggle_crawler import download_data from rdagent.scenarios.kaggle.knowledge_management.vector_base import ( KaggleExperienceBase, ) diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py index a4c0783f..43b4beb4 100644 --- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py @@ -1,178 +1,170 @@ # TODO: Fix -import numpy as np # linear algebra -import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import re +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) train = pd.read_csv("/kaggle/input/train.csv") test = pd.read_csv("/kaggle/input/test.csv") submission = pd.read_csv("/kaggle/input/sample_submission.csv") - -features = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'] +features = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"] target = train[features] - - -text_train = train['full_text'] -text_test = test['full_text'] +text_train = train["full_text"] +text_test = test["full_text"] text = pd.concat([text_train, text_test], ignore_index=True) - -count_words = text.str.findall(r'(\w+)').str.len() +count_words = text.str.findall(r"(\w+)").str.len() print(count_words.sum()) - """ Cleaning Text """ text = text.str.lower() # removing special characters and numbers -text = text.apply(lambda x : re.sub("[^a-z]\s","",x) ) +text = text.apply(lambda x: re.sub("[^a-z]\s", "", x)) # remove hash tags text = text.str.replace("#", "") -#remove words less than 3 character and greater than 7 -text = text.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2 and len(w)<8])) +# remove words less than 3 character and greater than 7 +text = text.apply(lambda x: " ".join([w for w in x.split() if len(w) > 2 and len(w) < 8])) # removing stopwords -#text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords )) +# text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords )) -count_words = text.str.findall(r'(\w+)').str.len() +count_words = text.str.findall(r"(\w+)").str.len() print(count_words.sum()) +most_freq_words = pd.Series(" ".join(text).lower().split()).value_counts()[:25] +text = text.apply(lambda x: " ".join(word for word in x.split() if word not in most_freq_words)) - - -most_freq_words = pd.Series(' '.join(text).lower().split()).value_counts()[:25] -text = text.apply(lambda x : " ".join(word for word in x.split() if word not in most_freq_words )) - -count_words = text.str.findall(r'(\w+)').str.len() +count_words = text.str.findall(r"(\w+)").str.len() apostrophe_dict = { -"ain't": "am not / are not", -"aren't": "are not / am not", -"can't": "cannot", -"can't've": "cannot have", -"'cause": "because", -"could've": "could have", -"couldn't": "could not", -"couldn't've": "could not have", -"didn't": "did not", -"doesn't": "does not", -"don't": "do not", -"hadn't": "had not", -"hadn't've": "had not have", -"hasn't": "has not", -"haven't": "have not", -"he'd": "he had / he would", -"he'd've": "he would have", -"he'll": "he shall / he will", -"he'll've": "he shall have / he will have", -"he's": "he has / he is", -"how'd": "how did", -"how'd'y": "how do you", -"how'll": "how will", -"how's": "how has / how is", -"i'd": "I had / I would", -"i'd've": "I would have", -"i'll": "I shall / I will", -"i'll've": "I shall have / I will have", -"i'm": "I am", -"i've": "I have", -"isn't": "is not", -"it'd": "it had / it would", -"it'd've": "it would have", -"it'll": "it shall / it will", -"it'll've": "it shall have / it will have", -"it's": "it has / it is", -"let's": "let us", -"ma'am": "madam", -"mayn't": "may not", -"might've": "might have", -"mightn't": "might not", -"mightn't've": "might not have", -"must've": "must have", -"mustn't": "must not", -"mustn't've": "must not have", -"needn't": "need not", -"needn't've": "need not have", -"o'clock": "of the clock", -"oughtn't": "ought not", -"oughtn't've": "ought not have", -"shan't": "shall not", -"sha'n't": "shall not", -"shan't've": "shall not have", -"she'd": "she had / she would", -"she'd've": "she would have", -"she'll": "she shall / she will", -"she'll've": "she shall have / she will have", -"she's": "she has / she is", -"should've": "should have", -"shouldn't": "should not", -"shouldn't've": "should not have", -"so've": "so have", -"so's": "so as / so is", -"that'd": "that would / that had", -"that'd've": "that would have", -"that's": "that has / that is", -"there'd": "there had / there would", -"there'd've": "there would have", -"there's": "there has / there is", -"they'd": "they had / they would", -"they'd've": "they would have", -"they'll": "they shall / they will", -"they'll've": "they shall have / they will have", -"they're": "they are", -"they've": "they have", -"to've": "to have", -"wasn't": "was not", -"we'd": "we had / we would", -"we'd've": "we would have", -"we'll": "we will", -"we'll've": "we will have", -"we're": "we are", -"we've": "we have", -"weren't": "were not", -"what'll": "what shall / what will", -"what'll've": "what shall have / what will have", -"what're": "what are", -"what's": "what has / what is", -"what've": "what have", -"when's": "when has / when is", -"when've": "when have", -"where'd": "where did", -"where's": "where has / where is", -"where've": "where have", -"who'll": "who shall / who will", -"who'll've": "who shall have / who will have", -"who's": "who has / who is", -"who've": "who have", -"why's": "why has / why is", -"why've": "why have", -"will've": "will have", -"won't": "will not", -"won't've": "will not have", -"would've": "would have", -"wouldn't": "would not", -"wouldn't've": "would not have", -"y'all": "you all", -"y'all'd": "you all would", -"y'all'd've": "you all would have", -"y'all're": "you all are", -"y'all've": "you all have", -"you'd": "you had / you would", -"you'd've": "you would have", -"you'll": "you shall / you will", -"you'll've": "you shall have / you will have", -"you're": "you are", -"you've": "you have" + "ain't": "am not / are not", + "aren't": "are not / am not", + "can't": "cannot", + "can't've": "cannot have", + "'cause": "because", + "could've": "could have", + "couldn't": "could not", + "couldn't've": "could not have", + "didn't": "did not", + "doesn't": "does not", + "don't": "do not", + "hadn't": "had not", + "hadn't've": "had not have", + "hasn't": "has not", + "haven't": "have not", + "he'd": "he had / he would", + "he'd've": "he would have", + "he'll": "he shall / he will", + "he'll've": "he shall have / he will have", + "he's": "he has / he is", + "how'd": "how did", + "how'd'y": "how do you", + "how'll": "how will", + "how's": "how has / how is", + "i'd": "I had / I would", + "i'd've": "I would have", + "i'll": "I shall / I will", + "i'll've": "I shall have / I will have", + "i'm": "I am", + "i've": "I have", + "isn't": "is not", + "it'd": "it had / it would", + "it'd've": "it would have", + "it'll": "it shall / it will", + "it'll've": "it shall have / it will have", + "it's": "it has / it is", + "let's": "let us", + "ma'am": "madam", + "mayn't": "may not", + "might've": "might have", + "mightn't": "might not", + "mightn't've": "might not have", + "must've": "must have", + "mustn't": "must not", + "mustn't've": "must not have", + "needn't": "need not", + "needn't've": "need not have", + "o'clock": "of the clock", + "oughtn't": "ought not", + "oughtn't've": "ought not have", + "shan't": "shall not", + "sha'n't": "shall not", + "shan't've": "shall not have", + "she'd": "she had / she would", + "she'd've": "she would have", + "she'll": "she shall / she will", + "she'll've": "she shall have / she will have", + "she's": "she has / she is", + "should've": "should have", + "shouldn't": "should not", + "shouldn't've": "should not have", + "so've": "so have", + "so's": "so as / so is", + "that'd": "that would / that had", + "that'd've": "that would have", + "that's": "that has / that is", + "there'd": "there had / there would", + "there'd've": "there would have", + "there's": "there has / there is", + "they'd": "they had / they would", + "they'd've": "they would have", + "they'll": "they shall / they will", + "they'll've": "they shall have / they will have", + "they're": "they are", + "they've": "they have", + "to've": "to have", + "wasn't": "was not", + "we'd": "we had / we would", + "we'd've": "we would have", + "we'll": "we will", + "we'll've": "we will have", + "we're": "we are", + "we've": "we have", + "weren't": "were not", + "what'll": "what shall / what will", + "what'll've": "what shall have / what will have", + "what're": "what are", + "what's": "what has / what is", + "what've": "what have", + "when's": "when has / when is", + "when've": "when have", + "where'd": "where did", + "where's": "where has / where is", + "where've": "where have", + "who'll": "who shall / who will", + "who'll've": "who shall have / who will have", + "who's": "who has / who is", + "who've": "who have", + "why's": "why has / why is", + "why've": "why have", + "will've": "will have", + "won't": "will not", + "won't've": "will not have", + "would've": "would have", + "wouldn't": "would not", + "wouldn't've": "would not have", + "y'all": "you all", + "y'all'd": "you all would", + "y'all'd've": "you all would have", + "y'all're": "you all are", + "y'all've": "you all have", + "you'd": "you had / you would", + "you'd've": "you would have", + "you'll": "you shall / you will", + "you'll've": "you shall have / you will have", + "you're": "you are", + "you've": "you have", } @@ -184,23 +176,23 @@ def lookup_dict(txt, dictionary): return txt -text = text.apply(lambda x: lookup_dict(x,apostrophe_dict)) +text = text.apply(lambda x: lookup_dict(x, apostrophe_dict)) # Remove rare words from collections import Counter from itertools import chain # split words into lists -v = text.str.split().tolist() +v = text.str.split().tolist() # compute global word frequency c = Counter(chain.from_iterable(v)) # filter, join, and re-assign -text = [' '.join([j for j in i if c[j] > 1]) for i in v] +text = [" ".join([j for j in i if c[j] > 1]) for i in v] text = pd.Series(text) total_word = 0 -for x,word in enumerate(text): +for x, word in enumerate(text): num_word = len(word.split()) - #print(num_word) + # print(num_word) total_word = total_word + num_word -print(total_word) \ No newline at end of file +print(total_word) diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py index f8c410bb..e43c6fc3 100644 --- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py @@ -1,15 +1,16 @@ -from sklearn.feature_extraction.text import TfidfVectorizer -import pandas as pd import numpy as np +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer + class TfidfFeature: def fit(self, train_df: pd.DataFrame): train_df = np.array(train_df).tolist() - train_X = list(map(''.join, train_df)) - self.model = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=0.01).fit(train_X) + train_X = list(map("".join, train_df)) + self.model = TfidfVectorizer(stop_words="english", max_df=0.5, min_df=0.01).fit(train_X) # print(self.model.get_feature_names_out()[:5]) def transform(self, X: pd.DataFrame): X = np.array(X).tolist() - X = list(map(''.join, X)) + X = list(map("".join, X)) return self.model.transform(X) diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py index 487e3a5b..f0d15b3c 100644 --- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py @@ -2,14 +2,17 @@ from sklearn.multioutput import MultiOutputRegressor from sklearn.svm import SVR + def select(X: pd.DataFrame) -> pd.DataFrame: return X + def fit(X_train: pd.DataFrame, y_train: pd.Series): model = MultiOutputRegressor(SVR()) model.fit(X_train, y_train) return model + def predict(model: MultiOutputRegressor, X_test: pd.DataFrame): X_test_selected = select(X_test) return model.predict(X_test_selected) diff --git a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py index 300f1ae7..29d957cb 100644 --- a/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py @@ -3,11 +3,14 @@ import importlib.util from pathlib import Path + def import_module_from_path(module_name, module_path): spec = importlib.util.spec_from_file_location(module_name, module_path) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) return module + + DIRNAME = Path(__file__).absolute().resolve().parent y = target @@ -24,11 +27,11 @@ def import_module_from_path(module_name, module_path): X_test_l.append(X_test_f) -submission['cohesion'] = predictions[:,0] -submission['syntax'] = predictions[:,1] -submission['vocabulary'] = predictions[:,2] -submission['phraseology'] = predictions[:,3] -submission['grammar'] = predictions[:,4] -submission['conventions'] = predictions[:,5] +submission["cohesion"] = predictions[:, 0] +submission["syntax"] = predictions[:, 1] +submission["vocabulary"] = predictions[:, 2] +submission["phraseology"] = predictions[:, 3] +submission["grammar"] = predictions[:, 4] +submission["conventions"] = predictions[:, 5] -submission.to_csv('submission.csv',index=False) # writing data to a CSV file \ No newline at end of file +submission.to_csv("submission.csv", index=False) # writing data to a CSV file diff --git a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py index df743a69..b905bf0d 100644 --- a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py +++ b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py @@ -1,5 +1,6 @@ from pathlib import Path +from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING from rdagent.components.coder.factor_coder.factor import ( FactorFBWorkspace, FactorTask, @@ -11,15 +12,19 @@ ModelTask, ) from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace -from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING + class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template") + self.experiment_workspace = KGFBWorkspace( + template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" + ) class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template") + self.experiment_workspace = KGFBWorkspace( + template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" + ) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py index cff02620..79975ba7 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py @@ -54,9 +54,9 @@ def import_module_from_path(module_name, module_path): X_valid_l.append(X_valid_f) X_test_l.append(X_test_f) -X_train = pd.concat(X_train_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_train_l))]) -X_valid = pd.concat(X_valid_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_valid_l))]) -X_test = pd.concat(X_test_l, axis=1, keys=[f'feature_{i}' for i in range(len(X_test_l))]) +X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) +X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) +X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) print(X_train.shape, X_valid.shape, X_test.shape) diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index 0e602262..e3bb6bb3 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -76,7 +76,9 @@ def _analysis_competition_description(self): def background(self) -> str: background_template = prompt_dict["kg_background"] - train_script = (Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py").read_text() + train_script = ( + Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py" + ).read_text() background_prompt = ( Environment(undefined=StrictUndefined) diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py index 759748b1..537a3cab 100644 --- a/rdagent/scenarios/kaggle/experiment/workspace.py +++ b/rdagent/scenarios/kaggle/experiment/workspace.py @@ -71,7 +71,9 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> str: entry=f"python train.py", env=run_env, running_extra_volume=( - {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} if KAGGLE_IMPLEMENT_SETTING.competition else None + {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} + if KAGGLE_IMPLEMENT_SETTING.competition + else None ), ) diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index d74e2faf..3cb634f5 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -1,16 +1,15 @@ import json -import time import subprocess +import time import zipfile - from pathlib import Path from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By -from rdagent.log import rdagent_logger as logger from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.log import rdagent_logger as logger options = webdriver.ChromeOptions() options.add_argument("--no-sandbox") diff --git a/test/utils/test_kaggle.py b/test/utils/test_kaggle.py index 8419c361..bd6e2693 100644 --- a/test/utils/test_kaggle.py +++ b/test/utils/test_kaggle.py @@ -1,24 +1,31 @@ import unittest -import nbformat from pathlib import Path +import nbformat +from rich import print + +from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace from rdagent.utils.agent.ret import PythonAgentOut from rdagent.utils.agent.tpl import T -from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING -from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace -from rich import print + class TestTpl(unittest.TestCase): def test_competition_template(self): competition = KAGGLE_IMPLEMENT_SETTING.competition print(f"[bold orange]{competition}[/bold orange]") - ws = KGFBWorkspace(template_folder_path=Path(__file__).parent.parent.parent / "rdagent/scenarios/kaggle/experiment" / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template") + ws = KGFBWorkspace( + template_folder_path=Path(__file__).parent.parent.parent + / "rdagent/scenarios/kaggle/experiment" + / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" + ) print(ws.workspace_path) ws.execute() success = (ws.workspace_path / "submission.csv").exists() ws.clear() return success + if __name__ == "__main__": unittest.main() From c74ef8c8f4934aca114ab13c42039ab09ce42d14 Mon Sep 17 00:00:00 2001 From: Bowen Xian Date: Fri, 20 Sep 2024 12:45:25 +0000 Subject: [PATCH 4/4] fix CI --- rdagent/app/kaggle/loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index e0261d29..b250c343 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -17,8 +17,8 @@ from rdagent.core.scenario import Scenario from rdagent.core.utils import import_class from rdagent.log import rdagent_logger as logger -from rdagent.scenarios.kaggle.kaggle_crawler import download_data from rdagent.log.time import measure_time +from rdagent.scenarios.kaggle.kaggle_crawler import download_data from rdagent.scenarios.kaggle.proposal.proposal import ( KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING,