check forest & s3e11 & s4e8 & spaceship, change params of forest & s3e26 & spaceship
TPLin22 committed Sep 25, 2024
1 parent 2fe0e6e commit 4c1e823
Showing 15 changed files with 121 additions and 188 deletions.

This file was deleted.

@@ -0,0 +1,27 @@
from catboost import CatBoostClassifier
import pandas as pd

def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    # Define CatBoost parameters
    cat_params = {
        'iterations': 5000,
        'learning_rate': 0.03,
        'od_wait': 1000,
        'depth': 7,
        'task_type': 'GPU',
        'l2_leaf_reg': 3,
        'eval_metric': 'Accuracy',
        'devices': '0',
        'verbose': 1000
    }

    # Initialize and train the CatBoost model
    model = CatBoostClassifier(**cat_params)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

    return model

def predict(model, X: pd.DataFrame):
    # Predict using the trained model
    y_pred = model.predict(X)
    return y_pred.reshape(-1, 1)
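A minimal usage sketch for the fit/predict pair above (illustrative only, not part of the commit: the synthetic frame, split sizes, and accuracy check are assumptions, and because cat_params sets task_type='GPU' a CUDA-capable device is required to actually run it):

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

# Synthetic stand-in data; the real pipeline passes the competition features instead
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 8)), columns=[f"f{i}" for i in range(8)])
y = pd.Series((X["f0"] + X["f1"] > 0).astype(int))

# Requires a GPU because the committed parameters request GPU training
model = fit(X.iloc[:400], y.iloc[:400], X.iloc[400:], y.iloc[400:])
preds = predict(model, X.iloc[400:])  # (n_samples, 1) array of predicted class labels
print("validation accuracy:", accuracy_score(y.iloc[400:], preds.ravel()))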
@@ -0,0 +1,75 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np

# Define the neural network model with Batch Normalization
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.layer3 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = torch.relu(self.bn1(self.layer1(x)))
        x = torch.relu(self.bn2(self.layer2(x)))
        x = torch.softmax(self.layer3(x), dim=1)
        return x

def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
    X_valid_tensor = torch.tensor(X_valid.values, dtype=torch.float32)
    y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.long)

    # Create datasets and dataloaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

    # Initialize the model, loss function and optimizer
    model = NeuralNetwork(input_size=X_train.shape[1], num_classes=len(set(y_train)))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train the model
    num_epochs = 150
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

        # Validate the model
        model.eval()
        valid_loss = 0
        correct = 0
        with torch.no_grad():
            for X_batch, y_batch in valid_loader:
                outputs = model(X_batch)
                valid_loss += criterion(outputs, y_batch).item()
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == y_batch).sum().item()

        accuracy = correct / len(valid_loader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {accuracy:.4f}')

    return model

def predict(model, X):
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    model.eval()
    with torch.no_grad():
        outputs = model(X_tensor)
        _, predicted = torch.max(outputs, 1)
    return predicted.numpy().reshape(-1, 1)
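A comparable smoke-test sketch for the neural-network variant, assuming numeric features and integer class labels 0..K-1 (the data below is synthetic and illustrative; a 1-D label Series is passed so the resulting tensors match what CrossEntropyLoss expects):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
y = pd.Series(rng.integers(0, 3, size=200))  # three classes encoded as 0, 1, 2

# Train on the first 160 rows, validate on the remaining 40
model = fit(X.iloc[:160], y.iloc[:160], X.iloc[160:], y.iloc[160:])
preds = predict(model, X.iloc[160:])  # (n_samples, 1) array of predicted class indices
print(preds.shape, np.unique(preds))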
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
    Define and train the Random Forest model. Merge feature selection into the pipeline.
    """
    # Initialize the Random Forest model
-    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
+    model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)

    # Select features (if any feature selection is needed)
    X_train_selected = select(X_train)
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
"num_class": len(set(y_train)), # Number of classes
"nthread": -1,
}
num_round = 20
num_round = 100

evallist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, evallist)
@@ -8,6 +8,7 @@
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
+from sklearn.preprocessing import StandardScaler

# Set random seed for reproducibility
SEED = 42
@@ -40,6 +41,7 @@ def import_module_from_path(module_name, module_path):
# Store results
accuracies = []
y_test_pred_l = []
+scaler = StandardScaler()

# 3) Train and evaluate using KFold
fold_number = 1
@@ -80,6 +82,11 @@ def import_module_from_path(module_name, module_path):
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
X_te = pd.DataFrame(imputer.transform(X_te), columns=X_te.columns)

+# Standardize the data
+X_tr = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
+X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
+X_te = pd.DataFrame(scaler.transform(X_te), columns=X_te.columns)

# Remove duplicate columns
X_tr = X_tr.loc[:, ~X_tr.columns.duplicated()]
X_val = X_val.loc[:, ~X_val.columns.duplicated()]
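For context, the scaling lines added above follow the usual scikit-learn pattern: learn the mean and standard deviation from the training fold only, then apply those same statistics to the validation and test frames so no information leaks out of the held-out data. A standalone sketch of that pattern (variable names and values here are illustrative, not taken from the repository):

import pandas as pd
from sklearn.preprocessing import StandardScaler

X_tr = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
X_val = pd.DataFrame({"a": [4.0], "b": [40.0]})

scaler = StandardScaler()
# fit_transform learns the statistics from the training fold and scales it
X_tr_scaled = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns)
# transform reuses those training statistics for the held-out data
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
print(X_tr_scaled.mean().round(6).tolist())  # ~[0.0, 0.0] after standardization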
@@ -39,4 +39,4 @@ def predict(model, X_test):
"""
X_test = select(X_test)
y_pred = model.predict(X_test)
return y_pred
return y_pred.reshape(-1, 1)
@@ -85,7 +85,7 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:


# For multiclass classification, use the mode of the predictions
-y_test_pred = np.mean(y_test_pred_l, axis=0)
+y_test_pred = np.mean(y_test_pred_l, axis=0).ravel()


submission_result = pd.DataFrame(np.expm1(y_test_pred), columns=["cost"])
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
    Define and train the Random Forest model. Merge feature selection into the pipeline.
    """
    # Initialize the Random Forest model
-    model = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)
+    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)

    # Select features (if any feature selection is needed)
    X_train_selected = select(X_train)
@@ -122,5 +122,5 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
y_test_pred_labels = np.where(y_test_pred == 1, "p", "e") # Convert the integers back to 'e' or 'p'

# 8) Submit predictions for the test set
-submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels})
+submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels.ravel()})
submission_result.to_csv("submission.csv", index=False)

This file was deleted.

@@ -47,8 +47,8 @@ def fit(X_train, y_train, X_valid, y_valid):

    # Train the model
    model.train()
-    for epoch in range(5):
-        print(f"Epoch {epoch + 1}/5")
+    for epoch in range(100):
+        print(f"Epoch {epoch + 1}/100")
        epoch_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device) # Move data to the device
@@ -73,4 +73,4 @@ def predict(model, X):
        batch = X_tensor[i : i + 32] # Predict in batches
        pred = model(batch).squeeze().cpu().numpy() # Move results back to CPU
        predictions.extend(pred)
-    return np.array(predictions) # Return boolean predictions
+    return np.array(predictions).reshape(-1, 1) # Return predictions
@@ -51,4 +51,4 @@ def predict(model, X):
    y_pred_prob = model.predict_proba(X_selected)[:, 1]

    # Apply threshold to get boolean predictions
-    return y_pred_prob
+    return y_pred_prob.reshape(-1, 1)
@@ -37,4 +37,4 @@ def predict(model, X):
    X = select(X)
    dtest = xgb.DMatrix(X)
    y_pred_prob = model.predict(dtest)
-    return y_pred_prob
+    return y_pred_prob.reshape(-1, 1)
@@ -118,6 +118,7 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:

y_test_pred = np.mean(y_test_pred_l, axis=0)
y_test_pred = (y_test_pred > 0.5).astype(bool)
+y_test_pred = y_test_pred.ravel()

submission_result = pd.DataFrame({"PassengerId": passenger_ids, "Transported": y_test_pred})

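The reshape(-1, 1) / ravel() pairing introduced across these files settles a single shape contract: each fold's predict() returns an (n_samples, 1) column, the fold predictions are averaged, and the result is flattened back to 1-D before it goes into the submission DataFrame. A small illustrative sketch of that flow (the values and IDs below are made up):

import numpy as np
import pandas as pd

# Three hypothetical fold predictions, each shaped (n_samples, 1) as predict() now returns
y_test_pred_l = [
    np.array([[0.2], [0.9]]),
    np.array([[0.4], [0.8]]),
    np.array([[0.3], [1.0]]),
]

y_test_pred = np.mean(y_test_pred_l, axis=0)    # still (n_samples, 1)
y_test_pred = (y_test_pred > 0.5).astype(bool)  # threshold to booleans
y_test_pred = y_test_pred.ravel()               # flatten so the DataFrame column is 1-D

submission = pd.DataFrame({"PassengerId": [13, 18], "Transported": y_test_pred})
print(submission)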
