check forest & s3e11 & s4e8 & spaceship, change params of forest & s3e26 & spaceship
TPLin22 committed Sep 25, 2024
1 parent 2fe0e6e commit 4c1e823
Showing 15 changed files with 121 additions and 188 deletions.

This file was deleted.

@@ -0,0 +1,27 @@
from catboost import CatBoostClassifier
import pandas as pd

def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    # Define CatBoost parameters
    cat_params = {
        'iterations': 5000,
        'learning_rate': 0.03,
        'od_wait': 1000,
        'depth': 7,
        'task_type': 'GPU',
        'l2_leaf_reg': 3,
        'eval_metric': 'Accuracy',
        'devices': '0',
        'verbose': 1000
    }

    # Initialize and train the CatBoost model
    model = CatBoostClassifier(**cat_params)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

    return model

def predict(model, X: pd.DataFrame):
    # Predict using the trained model
    y_pred = model.predict(X)
    return y_pred.reshape(-1, 1)
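A minimal usage sketch for the fit/predict pair above (illustrative only, not part of the commit: the synthetic frame, split sizes, and accuracy check are assumptions, and because cat_params sets task_type='GPU' a CUDA-capable device is required to actually run it):

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

# Synthetic stand-in data; the real pipeline passes the competition features instead
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 8)), columns=[f"f{i}" for i in range(8)])
y = pd.Series((X["f0"] + X["f1"] > 0).astype(int))

# Requires a GPU because the committed parameters request GPU training
model = fit(X.iloc[:400], y.iloc[:400], X.iloc[400:], y.iloc[400:])
preds = predict(model, X.iloc[400:])  # (n_samples, 1) array of predicted class labels
print("validation accuracy:", accuracy_score(y.iloc[400:], preds.ravel()))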
@@ -0,0 +1,75 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np

# Define the neural network model with Batch Normalization
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.layer3 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = torch.relu(self.bn1(self.layer1(x)))
        x = torch.relu(self.bn2(self.layer2(x)))
        x = torch.softmax(self.layer3(x), dim=1)
        return x

def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
    X_valid_tensor = torch.tensor(X_valid.values, dtype=torch.float32)
    y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.long)

    # Create datasets and dataloaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

    # Initialize the model, loss function and optimizer
    model = NeuralNetwork(input_size=X_train.shape[1], num_classes=len(set(y_train)))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train the model
    num_epochs = 150
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

        # Validate the model
        model.eval()
        valid_loss = 0
        correct = 0
        with torch.no_grad():
            for X_batch, y_batch in valid_loader:
                outputs = model(X_batch)
                valid_loss += criterion(outputs, y_batch).item()
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == y_batch).sum().item()

        accuracy = correct / len(valid_loader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {accuracy:.4f}')

    return model

def predict(model, X):
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    model.eval()
    with torch.no_grad():
        outputs = model(X_tensor)
        _, predicted = torch.max(outputs, 1)
    return predicted.numpy().reshape(-1, 1)
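A comparable smoke-test sketch for the neural-network variant, assuming numeric features and integer class labels 0..K-1 (the data below is synthetic and illustrative; a 1-D label Series is passed so the resulting tensors match what CrossEntropyLoss expects):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
y = pd.Series(rng.integers(0, 3, size=200))  # three classes encoded as 0, 1, 2

# Train on the first 160 rows, validate on the remaining 40
model = fit(X.iloc[:160], y.iloc[:160], X.iloc[160:], y.iloc[160:])
preds = predict(model, X.iloc[160:])  # (n_samples, 1) array of predicted class indices
print(preds.shape, np.unique(preds))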
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
    Define and train the Random Forest model. Merge feature selection into the pipeline.
    """
    # Initialize the Random Forest model
-    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
+    model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)

    # Select features (if any feature selection is needed)
    X_train_selected = select(X_train)
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
"num_class": len(set(y_train)), # Number of classes
"nthread": -1,
}
num_round = 20
num_round = 100

evallist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, evallist)
@@ -8,6 +8,7 @@
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
+from sklearn.preprocessing import StandardScaler

# Set random seed for reproducibility
SEED = 42
@@ -40,6 +41,7 @@ def import_module_from_path(module_name, module_path):
# Store results
accuracies = []
y_test_pred_l = []
+scaler = StandardScaler()

# 3) Train and evaluate using KFold
fold_number = 1
@@ -80,6 +82,11 @@ def import_module_from_path(module_name, module_path):
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
X_te = pd.DataFrame(imputer.transform(X_te), columns=X_te.columns)

+# Standardize the data
+X_tr = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
+X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
+X_te = pd.DataFrame(scaler.transform(X_te), columns=X_te.columns)

# Remove duplicate columns
X_tr = X_tr.loc[:, ~X_tr.columns.duplicated()]
X_val = X_val.loc[:, ~X_val.columns.duplicated()]
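For context, the scaling lines added above follow the usual scikit-learn pattern: learn the mean and standard deviation from the training fold only, then apply those same statistics to the validation and test frames so no information leaks out of the held-out data. A standalone sketch of that pattern (variable names and values here are illustrative, not taken from the repository):

import pandas as pd
from sklearn.preprocessing import StandardScaler

X_tr = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
X_val = pd.DataFrame({"a": [4.0], "b": [40.0]})

scaler = StandardScaler()
# fit_transform learns the statistics from the training fold and scales it
X_tr_scaled = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
# transform reuses those training statistics for the held-out data
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
print(X_tr_scaled.mean().round(6).tolist())  # ~[0.0, 0.0] after standardization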
@@ -39,4 +39,4 @@ def predict(model, X_test):
"""
X_test = select(X_test)
y_pred = model.predict(X_test)
return y_pred
return y_pred.reshape(-1, 1)
@@ -85,7 +85,7 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:


# For multiclass classification, use the mode of the predictions
-y_test_pred = np.mean(y_test_pred_l, axis=0)
+y_test_pred = np.mean(y_test_pred_l, axis=0).ravel()


submission_result = pd.DataFrame(np.expm1(y_test_pred), columns=["cost"])
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
    Define and train the Random Forest model. Merge feature selection into the pipeline.
    """
    # Initialize the Random Forest model
-    model = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)
+    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)

    # Select features (if any feature selection is needed)
    X_train_selected = select(X_train)
@@ -122,5 +122,5 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
y_test_pred_labels = np.where(y_test_pred == 1, "p", "e") # Convert the integers back to 'e' or 'p'

# 8) Submit predictions for the test set
-submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels})
+submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels.ravel()})
submission_result.to_csv("submission.csv", index=False)

This file was deleted.

@@ -47,8 +47,8 @@ def fit(X_train, y_train, X_valid, y_valid):

    # Train the model
    model.train()
-    for epoch in range(5):
-        print(f"Epoch {epoch + 1}/5")
+    for epoch in range(100):
+        print(f"Epoch {epoch + 1}/100")
        epoch_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device) # Move data to the device
@@ -73,4 +73,4 @@ def predict(model, X):
        batch = X_tensor[i : i + 32] # Predict in batches
        pred = model(batch).squeeze().cpu().numpy() # Move results back to CPU
        predictions.extend(pred)
-    return np.array(predictions) # Return boolean predictions
+    return np.array(predictions).reshape(-1, 1) # Return predictions
@@ -51,4 +51,4 @@ def predict(model, X):
    y_pred_prob = model.predict_proba(X_selected)[:, 1]

    # Apply threshold to get boolean predictions
-    return y_pred_prob
+    return y_pred_prob.reshape(-1, 1)
@@ -37,4 +37,4 @@ def predict(model, X):
    X = select(X)
    dtest = xgb.DMatrix(X)
    y_pred_prob = model.predict(dtest)
-    return y_pred_prob
+    return y_pred_prob.reshape(-1, 1)
@@ -118,6 +118,7 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:

y_test_pred = np.mean(y_test_pred_l, axis=0)
y_test_pred = (y_test_pred > 0.5).astype(bool)
+y_test_pred = y_test_pred.ravel()

submission_result = pd.DataFrame({"PassengerId": passenger_ids, "Transported": y_test_pred})

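The reshape(-1, 1) / ravel() pairing introduced across these files settles a single shape contract: each fold's predict() returns an (n_samples, 1) column, the fold predictions are averaged, and the result is flattened back to 1-D before it goes into the submission DataFrame. A small illustrative sketch of that flow (the values and IDs below are made up):

import numpy as np
import pandas as pd

# Three hypothetical fold predictions, each shaped (n_samples, 1) as predict() now returns
y_test_pred_l = [
    np.array([[0.2], [0.9]]),
    np.array([[0.4], [0.8]]),
    np.array([[0.3], [1.0]]),
]

y_test_pred = np.mean(y_test_pred_l, axis=0)    # still (n_samples, 1)
y_test_pred = (y_test_pred > 0.5).astype(bool)  # threshold to booleans
y_test_pred = y_test_pred.ravel()               # flatten so the DataFrame column is 1-D

submission = pd.DataFrame({"PassengerId": [13, 18], "Transported": y_test_pred})
print(submission)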
