Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: kaggle templates related #287

Merged
merged 6 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions rdagent/app/kaggle/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ class Config:

competition: str = ""

local_data_path: str = "/data/userdata/share/kaggle"

rag_path: str = "git_ignore_folder/rag"


Expand Down
5 changes: 5 additions & 0 deletions rdagent/app/kaggle/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger
from rdagent.log.time import measure_time
from rdagent.scenarios.kaggle.kaggle_crawler import download_data
from rdagent.scenarios.kaggle.proposal.proposal import (
KG_ACTION_FEATURE_ENGINEERING,
KG_ACTION_FEATURE_PROCESSING,
Expand Down Expand Up @@ -89,6 +90,10 @@ def main(path=None, step_n=None, competition=None):
"""
if competition:
KAGGLE_IMPLEMENT_SETTING.competition = competition
download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path)
else:
logger.error("Please specify competition name.")

if path is None:
kaggle_loop = KaggleRDLoop(KAGGLE_IMPLEMENT_SETTING)
else:
Expand Down
2 changes: 0 additions & 2 deletions rdagent/scenarios/kaggle/developer/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
KGModelExperiment,
)

META_TPL_DIR = Path(__file__).parent.parent / "experiment" / "meta_tpl"


class KGCachedRunner(CachedRunner[ASpecificExp]):
def build_from_SOTA(self, exp: ASpecificExp) -> None:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# TODO: Fix
import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the competition files and build one cleaned text corpus
# (train rows first, test rows after).
train = pd.read_csv("/kaggle/input/train.csv")
test = pd.read_csv("/kaggle/input/test.csv")
submission = pd.read_csv("/kaggle/input/sample_submission.csv")


# The six score columns used as the multi-output target.
features = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
target = train[features]


text_train = train["full_text"]
text_test = test["full_text"]

# Clean train and test essays together; ignore_index renumbers rows 0..n-1.
text = pd.concat([text_train, text_test], ignore_index=True)


# Total word count before cleaning.
count_words = text.str.findall(r"(\w+)").str.len()
print(count_words.sum())


# --- Cleaning text ---
text = text.str.lower()

# Remove each character outside a-z that is directly followed by whitespace,
# together with that whitespace character. The pattern is a raw string so
# that ``\s`` is not an invalid string escape.
# NOTE(review): because the whitespace is removed too, "end. next" becomes
# "endnext"; confirm this word merging is intended.
text = text.apply(lambda doc: re.sub(r"[^a-z]\s", "", doc))

# remove hash tags (literal replacement, no regex needed)
text = text.str.replace("#", "", regex=False)

# keep only words of 3 to 7 characters
text = text.apply(lambda doc: " ".join([w for w in doc.split() if 2 < len(w) < 8]))

# Stop-word removal is disabled: no `stopwords` collection is defined in this script.

# Total word count after cleaning.
count_words = text.str.findall(r"(\w+)").str.len()
print(count_words.sum())


# Drop the 25 most frequent words of the whole corpus.
most_freq_words = pd.Series(" ".join(text).lower().split()).value_counts().head(25)
# value_counts() puts the words in the index; make the membership test explicit.
frequent_words = set(most_freq_words.index)
text = text.apply(lambda doc: " ".join(word for word in doc.split() if word not in frequent_words))

count_words = text.str.findall(r"(\w+)").str.len()

# Lookup table mapping lower-case English contractions to their expanded form.
# Several values list alternative expansions separated by " / "; the lookup
# helper below inserts the value verbatim, slash included.
# NOTE(review): the length filter applied earlier in this script keeps only
# 3-7 character tokens, so keys of 8 or more characters cannot match here.
apostrophe_dict = {
    "ain't": "am not / are not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is",
    "i'd": "I had / I would",
    "i'd've": "I would have",
    "i'll": "I shall / I will",
    "i'll've": "I shall have / I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have",
}


def lookup_dict(txt, dictionary):
    """Expand whole whitespace-separated tokens of *txt* using *dictionary*.

    Each token is looked up by its lower-cased form. A matching token is
    replaced by the mapped value; every other token, and all original
    whitespace, is left untouched.

    Args:
        txt: Text to process.
        dictionary: Mapping from lower-case token to its replacement text.

    Returns:
        The text with every matching token replaced.
    """
    # Substitute token by token. A plain ``str.replace`` would also rewrite
    # the inside of longer tokens (e.g. "he's" inside "she's", or "can't"
    # inside "can't've") and re-scan the whole text for every word.
    return re.sub(
        r"\S+",
        lambda match: dictionary.get(match.group(0).lower(), match.group(0)),
        txt,
    )


# Expand contractions in every document.
text = text.apply(lambda doc: lookup_dict(doc, apostrophe_dict))

# Remove rare words
from collections import Counter
from itertools import chain

# One token list per document.
tokenized_docs = text.str.split().tolist()
# Corpus-wide frequency of every token.
word_counts = Counter(chain.from_iterable(tokenized_docs))
# Keep only tokens that occur more than once in the whole corpus.
text = pd.Series([" ".join(tok for tok in tokens if word_counts[tok] > 1) for tokens in tokenized_docs])

# Report the number of words left after all cleaning steps.
total_word = sum(len(doc.split()) for doc in text)
print(total_word)
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


class TfidfFeature:
    """TF-IDF text featurizer wrapping scikit-learn's ``TfidfVectorizer``."""

    @staticmethod
    def _as_documents(data):
        """Turn *data* into a list with one string per row.

        Every row of ``np.array(data).tolist()`` is passed to ``"".join``: a
        row that is already a string stays as it is, and a row that is a list
        of strings is concatenated without a separator.
        """
        rows = np.array(data).tolist()
        return ["".join(row) for row in rows]

    def fit(self, train_df: pd.DataFrame):
        """Fit the vectorizer on *train_df* and store it in ``self.model``."""
        documents = self._as_documents(train_df)
        vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, min_df=0.01)
        self.model = vectorizer.fit(documents)

    def transform(self, X: pd.DataFrame):
        """Return the fitted vectorizer's transform of the rows of *X*."""
        return self.model.transform(self._as_documents(X))
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR


def select(X: pd.DataFrame) -> pd.DataFrame:
    """Feature-selection hook; currently hands *X* back unchanged."""
    selected_features = X
    return selected_features


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame):
    """Train one SVR per target column and return the fitted wrapper.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets, one column per output
            (``MultiOutputRegressor`` needs two-dimensional targets).

    Returns:
        The fitted ``MultiOutputRegressor``.
    """
    # Apply the same feature selection that ``predict`` applies, so training
    # and inference always see the same columns.
    X_train_selected = select(X_train)
    model = MultiOutputRegressor(SVR())
    model.fit(X_train_selected, y_train)
    return model


def predict(model: MultiOutputRegressor, X_test: pd.DataFrame):
    """Run *model* on *X_test* after passing it through ``select``."""
    return model.predict(select(X_test))
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
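# TODO: fix train.py. It is not runnable yet: `target`, `text`, `train`, `X_train`,
# `X_train_l`, `X_test_l`, `predictions` and `submission` are used below but never defined here.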

import importlib.util
from pathlib import Path


def import_module_from_path(module_name, module_path):
    """Load and execute the Python source file at *module_path* as a module.

    Args:
        module_name: Name given to the created module (its ``__name__``).
        module_path: Location of the source file (``str`` or ``Path``).

    Returns:
        The executed module object. It is not registered in ``sys.modules``.

    Raises:
        ImportError: If no import spec or loader can be built for the path,
            for example when the file suffix is not a recognised module suffix.
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # spec_from_file_location returns None when it cannot pick a loader;
    # fail with a clear error instead of an AttributeError on None.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot load module {module_name!r} from {module_path}",
            name=module_name,
            path=str(module_path),
        )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# Directory that contains this script; feature modules are looked up relative to it.
DIRNAME = Path(__file__).absolute().resolve().parent

# NOTE(review): `target`, `text` and `train` are neither defined nor imported
# in this file; presumably they come from the preprocessing script. Confirm.
y = target
# The first len(train) rows of `text` are used as training data, the rest as test data.
X = text[: len(train)]
X_test = text[len(train) :]

# Run every module matching feature/feat*.py: fit it, then transform both splits.
# NOTE(review): `X_train`, `X_train_l` and `X_test_l` are never defined in this
# file (the training split above is named `X`), so this loop cannot run as written.
for f in DIRNAME.glob("feature/feat*.py"):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_test_f = cls.transform(X_test)

    X_train_l.append(X_train_f)
    X_test_l.append(X_test_f)


# Copy one prediction column per score into the submission frame.
# NOTE(review): `predictions` and `submission` are not defined in this file;
# no model is fitted or applied above. TODO confirm where they should come from.
submission["cohesion"] = predictions[:, 0]
submission["syntax"] = predictions[:, 1]
submission["vocabulary"] = predictions[:, 2]
submission["phraseology"] = predictions[:, 3]
submission["grammar"] = predictions[:, 4]
submission["conventions"] = predictions[:, 5]

submission.to_csv("submission.csv", index=False)  # writing data to a CSV file
9 changes: 7 additions & 2 deletions rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.components.coder.factor_coder.factor import (
FactorFBWorkspace,
FactorTask,
Expand All @@ -16,10 +17,14 @@
class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
    """Model experiment whose workspace is built from the competition's template folder."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Templates live next to this module in "<competition>_template".
        # The workspace is created once, from that folder only; the earlier
        # "meta_tpl" assignment was overwritten immediately and is dropped.
        # NOTE(review): with an empty competition name this resolves to
        # "_template"; confirm callers always set the competition first.
        template_dir = Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
        self.experiment_workspace = KGFBWorkspace(template_folder_path=template_dir)


class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]):
    """Feature experiment whose workspace is built from the competition's template folder."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Templates live next to this module in "<competition>_template".
        # The workspace is created once, from that folder only; the earlier
        # "meta_tpl" assignment was overwritten immediately and is dropped.
        # NOTE(review): with an empty competition name this resolves to
        # "_template"; confirm callers always set the competition first.
        template_dir = Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
        self.experiment_workspace = KGFBWorkspace(template_folder_path=template_dir)
Loading