feat: kaggle templates related (#287)

* add kaggle test * kaggle templates changes
microsoft · Sep 20, 2024 · 785fdc1 · 785fdc1
1 parent 77966c4
commit 785fdc1
Show file tree

Hide file tree

Showing 19 changed files with 781 additions and 41 deletions.
diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py
@@ -42,6 +42,8 @@ class Config:
 
     competition: str = ""
 
+    local_data_path: str = "/data/userdata/share/kaggle"
+
     rag_path: str = "git_ignore_folder/rag"
 
 

diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
@@ -18,6 +18,7 @@
 from rdagent.core.utils import import_class
 from rdagent.log import rdagent_logger as logger
 from rdagent.log.time import measure_time
+from rdagent.scenarios.kaggle.kaggle_crawler import download_data
 from rdagent.scenarios.kaggle.proposal.proposal import (
     KG_ACTION_FEATURE_ENGINEERING,
     KG_ACTION_FEATURE_PROCESSING,
@@ -89,6 +90,10 @@ def main(path=None, step_n=None, competition=None):
     """
     if competition:
         KAGGLE_IMPLEMENT_SETTING.competition = competition
+        download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path)
+    else:
+        logger.error("Please specify competition name.")
+
     if path is None:
         kaggle_loop = KaggleRDLoop(KAGGLE_IMPLEMENT_SETTING)
     else:

diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py
@@ -15,8 +15,6 @@
     KGModelExperiment,
 )
 
-META_TPL_DIR = Path(__file__).parent.parent / "experiment" / "meta_tpl"
-
 
 class KGCachedRunner(CachedRunner[ASpecificExp]):
     def build_from_SOTA(self, exp: ASpecificExp) -> None:

diff --git a/...ggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py b/...ggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
@@ -0,0 +1,198 @@
+# TODO: Fix
+import re
+
+import numpy as np  # linear algebra
+import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
+
+# Load the competition files from the fixed Kaggle input directory.
+train = pd.read_csv("/kaggle/input/train.csv")
+test = pd.read_csv("/kaggle/input/test.csv")
+submission = pd.read_csv("/kaggle/input/sample_submission.csv")
+
+
+# Columns of the training frame that are used as regression targets.
+features = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
+target = train[features]
+
+
+text_train = train["full_text"]
+text_test = test["full_text"]
+
+# Train and test essays are cleaned together; the train rows come first.
+text = pd.concat([text_train, text_test], ignore_index=True)
+
+
+# Total word count before cleaning (printed for inspection only).
+count_words = text.str.findall(r"(\w+)").str.len()
+print(count_words.sum())
+
+
+# --- Cleaning text ---
+text = text.str.lower()
+
+# Delete every character that is not a lowercase letter when it is directly
+# followed by whitespace; that whitespace character is deleted with it.
+# The pattern is a raw string because "\s" is an invalid escape sequence in a
+# plain string literal.
+text = text.apply(lambda x: re.sub(r"[^a-z]\s", "", x))
+
+# remove hash tags (literal match, independent of the pandas `regex` default)
+text = text.str.replace("#", "", regex=False)
+
+# keep only words that are 3 to 7 characters long
+text = text.apply(lambda x: " ".join([w for w in x.split() if 2 < len(w) < 8]))
+
+# removing stopwords
+# text = text.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords ))
+
+count_words = text.str.findall(r"(\w+)").str.len()
+print(count_words.sum())
+
+
+# Drop the 25 most frequent words of the whole corpus.
+most_freq_words = pd.Series(" ".join(text).lower().split()).value_counts()[:25]
+# `word in most_freq_words` tests the Series index (the words themselves);
+# an explicit set of that index states the intent directly.
+frequent_vocab = set(most_freq_words.index)
+text = text.apply(lambda x: " ".join(word for word in x.split() if word not in frequent_vocab))
+
+count_words = text.str.findall(r"(\w+)").str.len()
+
+# Lookup table from lowercase English contractions to their expanded form.
+# It is passed to `lookup_dict`, which inserts the value verbatim into the
+# text, so ambiguous entries keep both alternatives separated by " / ".
+apostrophe_dict = {
+    "ain't": "am not / are not",
+    "aren't": "are not / am not",
+    "can't": "cannot",
+    "can't've": "cannot have",
+    "'cause": "because",
+    "could've": "could have",
+    "couldn't": "could not",
+    "couldn't've": "could not have",
+    "didn't": "did not",
+    "doesn't": "does not",
+    "don't": "do not",
+    "hadn't": "had not",
+    "hadn't've": "had not have",
+    "hasn't": "has not",
+    "haven't": "have not",
+    "he'd": "he had / he would",
+    "he'd've": "he would have",
+    "he'll": "he shall / he will",
+    "he'll've": "he shall have / he will have",
+    "he's": "he has / he is",
+    "how'd": "how did",
+    "how'd'y": "how do you",
+    "how'll": "how will",
+    "how's": "how has / how is",
+    "i'd": "I had / I would",
+    "i'd've": "I would have",
+    "i'll": "I shall / I will",
+    "i'll've": "I shall have / I will have",
+    "i'm": "I am",
+    "i've": "I have",
+    "isn't": "is not",
+    "it'd": "it had / it would",
+    "it'd've": "it would have",
+    "it'll": "it shall / it will",
+    "it'll've": "it shall have / it will have",
+    "it's": "it has / it is",
+    "let's": "let us",
+    "ma'am": "madam",
+    "mayn't": "may not",
+    "might've": "might have",
+    "mightn't": "might not",
+    "mightn't've": "might not have",
+    "must've": "must have",
+    "mustn't": "must not",
+    "mustn't've": "must not have",
+    "needn't": "need not",
+    "needn't've": "need not have",
+    "o'clock": "of the clock",
+    "oughtn't": "ought not",
+    "oughtn't've": "ought not have",
+    "shan't": "shall not",
+    "sha'n't": "shall not",
+    "shan't've": "shall not have",
+    "she'd": "she had / she would",
+    "she'd've": "she would have",
+    "she'll": "she shall / she will",
+    "she'll've": "she shall have / she will have",
+    "she's": "she has / she is",
+    "should've": "should have",
+    "shouldn't": "should not",
+    "shouldn't've": "should not have",
+    "so've": "so have",
+    "so's": "so as / so is",
+    "that'd": "that would / that had",
+    "that'd've": "that would have",
+    "that's": "that has / that is",
+    "there'd": "there had / there would",
+    "there'd've": "there would have",
+    "there's": "there has / there is",
+    "they'd": "they had / they would",
+    "they'd've": "they would have",
+    "they'll": "they shall / they will",
+    "they'll've": "they shall have / they will have",
+    "they're": "they are",
+    "they've": "they have",
+    "to've": "to have",
+    "wasn't": "was not",
+    "we'd": "we had / we would",
+    "we'd've": "we would have",
+    "we'll": "we will",
+    "we'll've": "we will have",
+    "we're": "we are",
+    "we've": "we have",
+    "weren't": "were not",
+    "what'll": "what shall / what will",
+    "what'll've": "what shall have / what will have",
+    "what're": "what are",
+    "what's": "what has / what is",
+    "what've": "what have",
+    "when's": "when has / when is",
+    "when've": "when have",
+    "where'd": "where did",
+    "where's": "where has / where is",
+    "where've": "where have",
+    "who'll": "who shall / who will",
+    "who'll've": "who shall have / who will have",
+    "who's": "who has / who is",
+    "who've": "who have",
+    "why's": "why has / why is",
+    "why've": "why have",
+    "will've": "will have",
+    "won't": "will not",
+    "won't've": "will not have",
+    "would've": "would have",
+    "wouldn't": "would not",
+    "wouldn't've": "would not have",
+    "y'all": "you all",
+    "y'all'd": "you all would",
+    "y'all'd've": "you all would have",
+    "y'all're": "you all are",
+    "y'all've": "you all have",
+    "you'd": "you had / you would",
+    "you'd've": "you would have",
+    "you'll": "you shall / you will",
+    "you'll've": "you shall have / you will have",
+    "you're": "you are",
+    "you've": "you have",
+}
+
+
+def lookup_dict(txt, dictionary):
+    """Replace every whitespace-delimited token of ``txt`` found in ``dictionary``.
+
+    Tokens are looked up case-insensitively (by their lowercase form) and
+    replaced as whole tokens, so a short key such as "he's" can no longer
+    rewrite the tail of a longer word such as "she's". The whitespace between
+    tokens is left untouched.
+
+    Args:
+        txt: The text to process.
+        dictionary: Mapping from lowercase token to its replacement string.
+
+    Returns:
+        The text with all matching tokens replaced.
+    """
+
+    def _expand(match):
+        token = match.group()
+        # Unknown tokens are returned unchanged.
+        return dictionary.get(token.lower(), token)
+
+    return re.sub(r"\S+", _expand, txt)
+
+
+# Expand contractions using the lookup table defined above.
+text = text.apply(lambda x: lookup_dict(x, apostrophe_dict))
+
+# Remove rare words
+from collections import Counter
+from itertools import chain
+
+# split words into lists
+v = text.str.split().tolist()
+# compute global word frequency
+c = Counter(chain.from_iterable(v))
+# keep only words that occur more than once in the whole corpus
+text = pd.Series([" ".join(word for word in words if c[word] > 1) for words in v])
+
+# Total number of words left after cleaning (printed for inspection only).
+total_word = sum(len(document.split()) for document in text)
+print(total_word)
diff --git a/...os/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py b/...os/kaggle/experiment/feedback-prize-english-language-learning_template/feature/feature.py
@@ -0,0 +1,16 @@
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+class TfidfFeature:
+    """TF-IDF text feature with a ``fit`` / ``transform`` interface."""
+
+    @staticmethod
+    def _as_documents(data) -> list:
+        """Turn ``data`` into a list of strings, one per row.
+
+        Every row is passed through ``"".join``: a row holding several string
+        cells is concatenated, a row that is already a single string is kept
+        as it is.
+        """
+        return list(map("".join, np.array(data).tolist()))
+
+    def fit(self, train_df: pd.DataFrame):
+        """Fit the TF-IDF vocabulary on the training text and store it in ``self.model``."""
+        from sklearn.feature_extraction.text import TfidfVectorizer
+
+        train_X = self._as_documents(train_df)
+        self.model = TfidfVectorizer(stop_words="english", max_df=0.5, min_df=0.01).fit(train_X)
+
+    def transform(self, X: pd.DataFrame):
+        """Return the TF-IDF matrix of ``X``; ``fit`` must have been called before."""
+        return self.model.transform(self._as_documents(X))
+
+
+# Name looked up by the template's train.py when it loads each feature module.
+feature_engineering_cls = TfidfFeature
diff --git a/...narios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py b/...narios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model.py
@@ -0,0 +1,18 @@
+import pandas as pd
+from sklearn.multioutput import MultiOutputRegressor
+from sklearn.svm import SVR
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """Return the feature columns to use; this template keeps all of them."""
+    selected = X
+    return selected
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.Series):
+    """Train one SVR per target column and return the fitted model.
+
+    The training features go through ``select`` first, exactly like the test
+    features in ``predict``, so both always see the same columns.
+
+    Args:
+        X_train: Training feature matrix.
+        y_train: Training targets.
+
+    Returns:
+        The fitted ``MultiOutputRegressor``.
+    """
+    X_train_selected = select(X_train)
+    model = MultiOutputRegressor(SVR())
+    model.fit(X_train_selected, y_train)
+    return model
+
+
+def predict(model: MultiOutputRegressor, X_test: pd.DataFrame):
+    """Return the predictions of ``model`` for the selected columns of ``X_test``."""
+    return model.predict(select(X_test))
diff --git a/...nt/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py b/...nt/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/train.py
@@ -0,0 +1,37 @@
+# TODO: fix the train.py
+
+import importlib.util
+from pathlib import Path
+
+
+def import_module_from_path(module_name, module_path):
+    """Load the Python source file at ``module_path`` as a module.
+
+    The module is executed and returned; it is not added to ``sys.modules``.
+
+    Args:
+        module_name: Name given to the loaded module.
+        module_path: Location of the source file.
+
+    Returns:
+        The executed module object.
+
+    Raises:
+        ImportError: If no import spec or loader can be built for the path.
+    """
+    spec = importlib.util.spec_from_file_location(module_name, module_path)
+    # spec_from_file_location returns None for paths it cannot handle.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+# Directory that contains this script; feature modules are searched below it.
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+# NOTE(review): `target`, `text` and `train` are neither defined nor imported in
+# this file. Presumably they are the module-level variables of
+# fea_share_preprocess.py — confirm and import them.
+y = target
+# The first len(train) entries of `text` are used as training data, the rest as test data.
+X = text[: len(train)]
+X_test = text[len(train) :]
+
+# Load every feature/feat*.py module, instantiate its `feature_engineering_cls`,
+# fit it and collect the transformed train and test features.
+# NOTE(review): `X_train`, `X_train_l` and `X_test_l` are never assigned in this
+# file (only `X` is) — this loop cannot run as written.
+for f in DIRNAME.glob("feature/feat*.py"):
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_f = cls.transform(X_train)
+    X_test_f = cls.transform(X_test)
+
+    X_train_l.append(X_train_f)
+    X_test_l.append(X_test_f)
+
+
+# Copy the six prediction columns into the submission frame.
+# NOTE(review): `predictions` and `submission` are not defined in this file; no
+# model is fitted or applied before this point.
+submission["cohesion"] = predictions[:, 0]
+submission["syntax"] = predictions[:, 1]
+submission["vocabulary"] = predictions[:, 2]
+submission["phraseology"] = predictions[:, 3]
+submission["grammar"] = predictions[:, 4]
+submission["conventions"] = predictions[:, 5]
+
+submission.to_csv("submission.csv", index=False)  # writing data to a CSV file
diff --git a/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py b/rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 
+from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
 from rdagent.components.coder.factor_coder.factor import (
     FactorFBWorkspace,
     FactorTask,
@@ -16,10 +17,14 @@
 class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / "meta_tpl")
+        # The workspace template is the sibling folder "<competition>_template",
+        # named after KAGGLE_IMPLEMENT_SETTING.competition at construction time.
+        # NOTE(review): an empty competition name would resolve to "_template" —
+        # confirm the setting is always filled in before this runs.
+        self.experiment_workspace = KGFBWorkspace(
+            template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
+        )
 
 
 class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]):
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / "meta_tpl")
+        # The workspace template is the sibling folder "<competition>_template",
+        # named after KAGGLE_IMPLEMENT_SETTING.competition at construction time.
+        # NOTE(review): an empty competition name would resolve to "_template" —
+        # confirm the setting is always filled in before this runs.
+        self.experiment_workspace = KGFBWorkspace(
+            template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
+        )
Original file line number	Diff line number	Diff line change
Expand Up		@@ -42,6 +42,8 @@ class Config:

		competition: str = ""

		local_data_path: str = "/data/userdata/share/kaggle"

		rag_path: str = "git_ignore_folder/rag"


Expand Down