---
title: "random_forest"
author: "Peter"
date: "`r Sys.Date()`"
output: html_document
---
```{r}
# Load necessary libraries
library(data.table)
library(randomForest)
library(caret)
library(pROC)
# Load the dataset
RF_impute_df <- fread("RF_imputation_NEW.csv")
# Subset the data to exclude unnecessary columns
RF_complete_df <- subset(RF_impute_df, select = -c(deathtime, survival_time, LOS, Unnamed_0, V1, admittime, ID, group, tLOS))
# Ensure the outcome variable is included and properly named
if (!("outcome" %in% names(RF_complete_df))) {
stop("The 'outcome' column does not exist in the dataframe.")
}
# Ensure the outcome variable has valid factor levels
RF_complete_df$outcome <- factor(RF_complete_df$outcome, levels = unique(RF_complete_df$outcome), labels = make.names(unique(RF_complete_df$outcome)))
# Print the column names to verify
cat("Column names in the dataset:\n")
print(names(RF_complete_df))
# Prepare the predictor matrix and outcome variable
x_rf <- as.matrix(RF_complete_df[, setdiff(names(RF_complete_df), "outcome"), with = FALSE])
y_rf <- RF_complete_df$outcome
# Set seed for reproducibility
set.seed(510)
# Define cross-validation control
train_control <- trainControl(
method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
# Define the grid for hyperparameter tuning
tune_grid <- expand.grid(
mtry = 10 # Number of variables randomly sampled as candidates at each split
)
# Train the Random Forest model using cross-validation with the specified parameters
rf_cv_model <- train(
x = x_rf,
y = y_rf,
method = "rf",
trControl = train_control,
tuneGrid = tune_grid,
metric = "ROC",
importance = TRUE,
ntree = 500, # Number of trees
nodesize = 10 # Minimum size of terminal nodes
)
# Print the results of the cross-validation
print(rf_cv_model)
# Extract the cross-validated AUC
auc <- max(rf_cv_model$results$ROC)
cat("Cross-validated AUC:", auc, "\n")
# Predict on the test data (if you have a separate test set)
# Here we use the cross-validated predictions
predictions <- rf_cv_model$pred$pred
probs <- rf_cv_model$pred[, make.names(levels(y_rf)[2])] # Assuming the positive class is the second level
# Calculate log-loss
log_loss <- function(y_true, y_prob) {
epsilon <- 1e-15
y_prob <- pmax(epsilon, pmin(1 - epsilon, y_prob))
-mean(y_true * log(y_prob) + (1 - y_true) * log(1 - y_prob))
}
# Convert factor levels to 0 and 1 for log-loss calculation
y_true <- ifelse(rf_cv_model$pred$obs == levels(y_rf)[2], 1, 0)
log_loss_value <- log_loss(y_true, probs)
cat("Cross-validated log-loss:", log_loss_value, "\n")
# Extract feature importances correctly
importance_rf <- varImp(rf_cv_model, scale = FALSE)
importance_values <- importance_rf$importance[, 1] # Extract importance values
# Print feature importances
cat("Feature importances from Random Forest (cross-validated):\n")
print(importance_values)
# Plot feature importances
plot(importance_rf, main = "Variable Importance (Cross-Validated)")
# Select important features based on a threshold
important_vars <- rownames(importance_rf$importance)[importance_values > mean(importance_values)]
cat("Selected important variables:\n")
print(important_vars)
```
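The chunk below hardcodes `important_features`; presumably this list mirrors the `important_vars` printed above. A minimal sketch of deriving the same list programmatically instead (assuming the previous chunk has just been run and `important_vars` and `RF_impute_df` are still in scope):
```{r}
# Hedged sketch: carry the selection forward rather than retyping it.
important_features <- c(important_vars, "outcome")
RF_simplified_df <- RF_impute_df[, ..important_features]
str(RF_simplified_df)
```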
```{r}
# Load necessary libraries
library(data.table)
library(randomForest)
library(caret)
library(pROC)
# Load the dataset
RF_impute_df <- fread("RF_imputation_NEW.csv")
# Subset the data to include only selected important variables and outcome
important_features <- c("Renal_failure", "heart_rate", "temperature", "SP_O2", "Urine_output", "RBC", "Leucocyte", "Platelets",
"Neutrophils", "Lymphocyte", "PT", "Creatinine", "Urea_nitrogen", "Blood_sodium", "Blood_calcium",
"Anion_gap", "Magnesium_ion", "Bicarbonate", "Lactic_acid", "comorb_score", "outcome")
RF_simplified_df <- RF_impute_df[, ..important_features]
# Ensure the outcome variable has valid factor levels
RF_simplified_df$outcome <- factor(RF_simplified_df$outcome, levels = unique(RF_simplified_df$outcome), labels = make.names(unique(RF_simplified_df$outcome)))
# Prepare the predictor matrix and outcome variable
x_simplified <- as.matrix(RF_simplified_df[, -"outcome", with = FALSE])
y_simplified <- RF_simplified_df$outcome
# Set seed for reproducibility
set.seed(510)
# Define cross-validation control
train_control <- trainControl(
method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
# Train the Random Forest model using cross-validation with the simplified dataset
rf_cv_model_simplified <- train(
x = x_simplified,
y = y_simplified,
method = "rf",
trControl = train_control,
metric = "ROC",
importance = TRUE,
ntree = 500, # Number of trees
nodesize = 10, # Minimum size of terminal nodes
tuneGrid = expand.grid(mtry = 10) # Number of variables randomly sampled as candidates at each split
)
# Print the results of the cross-validation
print(rf_cv_model_simplified)
# Extract the cross-validated AUC
auc_simplified <- max(rf_cv_model_simplified$results$ROC)
cat("Cross-validated AUC for simplified model:", auc_simplified, "\n")
# Predict on the test data (if you have a separate test set)
# Here we use the cross-validated predictions
predictions_simplified <- rf_cv_model_simplified$pred$pred
probs_simplified <- rf_cv_model_simplified$pred[, make.names(levels(y_simplified)[2])] # Assuming the positive class is the second level
# Calculate log-loss
log_loss <- function(y_true, y_prob) {
epsilon <- 1e-15
y_prob <- pmax(epsilon, pmin(1 - epsilon, y_prob))
-mean(y_true * log(y_prob) + (1 - y_true) * log(1 - y_prob))
}
# Convert factor levels to 0 and 1 for log-loss calculation
y_true_simplified <- ifelse(rf_cv_model_simplified$pred$obs == levels(y_simplified)[2], 1, 0)
log_loss_value_simplified <- log_loss(y_true_simplified, probs_simplified)
cat("Cross-validated log-loss for simplified model:", log_loss_value_simplified, "\n")
# Extract feature importances for the simplified model
importance_rf_simplified <- varImp(rf_cv_model_simplified, scale = FALSE)
# Print feature importances for the simplified model
cat("Feature importances from Random Forest (simplified model):\n")
print(importance_rf_simplified)
# Plot feature importances for the simplified model
plot(importance_rf_simplified, main = "Variable Importance (Cross-Validated, Simplified Model)")
```
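A quick side-by-side of the two cross-validated fits; a minimal sketch assuming both model objects from the chunks above are still in scope:
```{r}
# Hedged sketch: compare the best CV AUC of the full and simplified models.
cat("Full model CV AUC:      ", max(rf_cv_model$results$ROC), "\n")
cat("Simplified model CV AUC:", max(rf_cv_model_simplified$results$ROC), "\n")
```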
```{r}
# Attempted leakage check. Caveat: data.table does not keep persistent
# rownames, so rownames() here are just regenerated row indices and this
# intersect() cannot prove the partitions are disjoint. The next chunk adds
# an explicit `rn` id column to check this properly.
train_idx <- createDataPartition(y_rf, p = 0.8, list = FALSE)
train_data <- RF_complete_df[train_idx, ]
test_data <- RF_complete_df[-train_idx, ]
# Check overlap (unreliable for the reason above)
intersect(rownames(train_data), rownames(test_data))
```
```{r}
# Load necessary libraries
library(data.table)
library(randomForest)
library(caret)
library(pROC)
# Load the dataset
RF_impute_df <- fread("RF_imputation_NEW.csv")
# Subset the data to exclude unnecessary columns
RF_complete_df <- subset(RF_impute_df, select = -c(deathtime, survival_time, LOS, Unnamed_0, V1, admittime, ID, group, tLOS, Anion_gap, subject_id))
# Ensure the outcome variable is included and properly named
if (!("outcome" %in% names(RF_complete_df))) {
stop("The 'outcome' column does not exist in the dataframe.")
}
# Ensure the outcome variable has valid factor levels
RF_complete_df$outcome <- factor(RF_complete_df$outcome, levels = unique(RF_complete_df$outcome), labels = make.names(unique(RF_complete_df$outcome)))
# Add unique row identifiers
RF_complete_df$rn <- seq_len(nrow(RF_complete_df))
# Properly partition data to avoid leakage
set.seed(510)
train_idx <- createDataPartition(RF_complete_df$outcome, p = 0.8, list = FALSE)
train_data <- RF_complete_df[train_idx, ]
test_data <- RF_complete_df[-train_idx, ]
# Ensure no overlap
if (length(intersect(train_data$rn, test_data$rn)) == 0) {
cat("No data leakage detected.\n")
} else {
stop("Data leakage detected.")
}
# Check the sizes of the training and test sets
cat("Training set size:", nrow(train_data), "\n")
cat("Test set size:", nrow(test_data), "\n")
# Remove the unique row identifiers
train_data <- train_data[, -c("rn")]
test_data <- test_data[, -c("rn")]
# Prepare the predictor matrix and outcome variable for training data
x_rf_train <- as.matrix(train_data[, setdiff(names(train_data), "outcome"), with = FALSE])
y_rf_train <- train_data$outcome
# Prepare the predictor matrix and outcome variable for test data
x_rf_test <- as.matrix(test_data[, setdiff(names(test_data), "outcome"), with = FALSE])
y_rf_test <- test_data$outcome
# Set seed for reproducibility
set.seed(510)
# Define cross-validation control
train_control <- trainControl(
method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
# Define the grid for hyperparameter tuning
tune_grid <- expand.grid(
mtry = seq(2, 10, 2) # Number of variables randomly sampled as candidates at each split
)
# Train the Random Forest model using cross-validation with the specified parameters
rf_cv_model <- train(
x = x_rf_train,
y = y_rf_train,
method = "rf",
trControl = train_control,
tuneGrid = tune_grid,
metric = "ROC",
importance = TRUE,
ntree = 100, # Further reduce number of trees to prevent overfitting
nodesize = 20 # Further increase the minimum size of terminal nodes
)
# Print the results of the cross-validation
print(rf_cv_model)
# Evaluate the model on the test set
test_pred_probs <- predict(rf_cv_model, newdata = x_rf_test, type = "prob")[, 2]
test_roc <- roc(response = as.numeric(y_rf_test) - 1, predictor = test_pred_probs)
test_auc <- test_roc$auc
cat("Test AUC:", test_auc, "\n")
# Confusion matrix on test set
test_pred_class <- ifelse(test_pred_probs > 0.5, 1, 0)
conf_matrix_test <- table(Predicted = test_pred_class, Actual = as.numeric(y_rf_test) - 1)
print(conf_matrix_test)
# Extract and plot feature importances
importance_rf <- varImp(rf_cv_model, scale = FALSE)
importance_values <- importance_rf$importance[, 1] # Extract importance values
# Print feature importances
cat("Feature importances from Random Forest (cross-validated):\n")
print(importance_values)
# Plot feature importances
plot(importance_rf, main = "Variable Importance (Cross-Validated)")
# Generate learning curves
train_sizes <- seq(0.1, 1.0, by = 0.1)
train_scores <- numeric(length(train_sizes))
test_scores <- numeric(length(train_sizes))
for (i in seq_along(train_sizes)) {
train_size <- train_sizes[i]
train_idx <- createDataPartition(y_rf_train, p = train_size, list = FALSE)
x_train_part <- x_rf_train[train_idx, ]
y_train_part <- y_rf_train[train_idx]
if (length(train_idx) > 0 && nrow(x_train_part) > 0 && nrow(x_rf_train[-train_idx, ]) > 0) {
rf_cv_model_part <- train(
x = x_train_part,
y = y_train_part,
method = "rf",
trControl = train_control,
tuneGrid = tune_grid,
metric = "ROC",
importance = TRUE,
ntree = 100, # Keep trees modest for the repeated subsample fits
nodesize = 5, # Smaller terminal nodes here than in the main model (randomForest's classification default is 1)
maxnodes = 30 # Cap the number of terminal nodes to limit tree depth
)
train_pred <- predict(rf_cv_model_part, newdata = x_train_part, type = "prob")[, 2] # Probabilities for positive class
test_pred <- predict(rf_cv_model_part, newdata = x_rf_train[-train_idx, ], type = "prob")[, 2] # Probabilities for positive class
# Calculate AUC for training and test sets
train_scores[i] <- roc(response = as.numeric(y_train_part) - 1, predictor = train_pred)$auc
test_scores[i] <- roc(response = as.numeric(y_rf_train[-train_idx]) - 1, predictor = test_pred)$auc
} else {
train_scores[i] <- NA
test_scores[i] <- NA
}
}
# Function to plot learning curves
plot_learning_curve <- function(train_sizes, train_scores, test_scores) {
plot(train_sizes, train_scores, type = "o", col = "blue", ylim = range(c(train_scores, test_scores), na.rm = TRUE), xlab = "Training Size", ylab = "Score", main = "Learning Curves")
lines(train_sizes, test_scores, type = "o", col = "red")
legend("bottomright", legend = c("Train", "Validation"), col = c("blue", "red"), lty = 1)
}
# Plot the learning curves
plot_learning_curve(train_sizes, train_scores, test_scores)
cat("Training AUCs:", train_scores, "\n")
cat("Validation AUCs:", test_scores, "\n")
```
```{r}
# Clear the workspace (objects only; attached packages stay loaded) and
# confirm the working directory before rerunning from a clean slate
rm(list = ls())
getwd()
```
```{r}
# Reload libraries: rm(list = ls()) above clears objects but leaves packages
# attached, so these calls are redundant in-session but keep the chunk
# self-contained
library(data.table)
library(randomForest)
library(caret)
library(pROC)
# Load the dataset
RF_impute_df <- fread("RF_imputation_NEW.csv")
# Subset the data to exclude unnecessary columns
RF_complete_df <- subset(RF_impute_df, select = -c(deathtime, survival_time, LOS, Unnamed_0, V1, admittime, ID, group, tLOS))
# Ensure the outcome variable is included and properly named
if (!("outcome" %in% names(RF_complete_df))) {
stop("The 'outcome' column does not exist in the dataframe.")
}
# Ensure the outcome variable has valid factor levels
RF_complete_df$outcome <- factor(RF_complete_df$outcome, levels = unique(RF_complete_df$outcome), labels = make.names(unique(RF_complete_df$outcome)))
# Add unique row identifiers
RF_complete_df$rn <- seq_len(nrow(RF_complete_df))
# Properly partition data to avoid leakage
set.seed(630)
train_idx <- createDataPartition(RF_complete_df$outcome, p = 0.8, list = FALSE)
train_data <- RF_complete_df[train_idx, ]
test_data <- RF_complete_df[-train_idx, ]
# Ensure no overlap
if (length(intersect(train_data$rn, test_data$rn)) == 0) {
cat("No data leakage detected.\n")
} else {
stop("Data leakage detected.")
}
# Check the sizes of the training and test sets
cat("Training set size:", nrow(train_data), "\n")
cat("Test set size:", nrow(test_data), "\n")
# Remove the unique row identifiers
train_data <- train_data[, -c("rn")]
test_data <- test_data[, -c("rn")]
# Prepare the predictor matrix and outcome variable for training data
x_rf_train <- as.matrix(train_data[, setdiff(names(train_data), "outcome"), with = FALSE])
y_rf_train <- train_data$outcome
# Prepare the predictor matrix and outcome variable for test data
x_rf_test <- as.matrix(test_data[, setdiff(names(test_data), "outcome"), with = FALSE])
y_rf_test <- test_data$outcome
# Set seed for reproducibility
set.seed(630)
# Define cross-validation control
train_control <- trainControl(
method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
# Train the Random Forest model with more trees and wider hyperparameter tuning
rf_cv_model <- train(
x = x_rf_train,
y = y_rf_train,
method = "rf",
trControl = train_control,
tuneLength = 10,
metric = "ROC",
importance = TRUE,
ntree = 100, # Modest number of trees
nodesize = 20 # Larger terminal nodes for regularization
)
# Print the results of the cross-validation
print(rf_cv_model)
# Evaluate the model on the test set
test_pred_probs <- predict(rf_cv_model, newdata = x_rf_test, type = "prob")[, 2]
test_roc <- roc(response = as.numeric(y_rf_test) - 1, predictor = test_pred_probs)
test_auc <- test_roc$auc
cat("Test AUC:", test_auc, "\n")
# Confusion matrix on test set
test_pred_class <- ifelse(test_pred_probs > 0.5, 1, 0)
conf_matrix_test <- table(Predicted = test_pred_class, Actual = as.numeric(y_rf_test) - 1)
print(conf_matrix_test)
# Extract and plot feature importances
importance_rf <- varImp(rf_cv_model, scale = FALSE)
importance_values <- importance_rf$importance[, 1] # Extract importance values
# Print feature importances
cat("Feature importances from Random Forest (cross-validated):\n")
print(importance_values)
# Plot feature importances
plot(importance_rf, main = "Variable Importance (Cross-Validated)")
# Generate learning curves
train_sizes <- seq(0.1, 1.0, by = 0.1)
train_scores <- numeric(length(train_sizes))
test_scores <- numeric(length(train_sizes))
for (i in seq_along(train_sizes)) {
train_size <- train_sizes[i]
train_idx <- createDataPartition(y_rf_train, p = train_size, list = FALSE)
x_train_part <- x_rf_train[train_idx, ]
y_train_part <- y_rf_train[train_idx]
if (length(train_idx) > 0 && nrow(x_train_part) > 0 && nrow(x_rf_train[-train_idx, ]) > 0) {
rf_cv_model_part <- train(
x = x_train_part,
y = y_train_part,
method = "rf",
trControl = train_control,
tuneLength = 10,
metric = "ROC",
importance = TRUE,
ntree = 500, # More trees for stabler subsample fits
nodesize = 10, # Minimum size of terminal nodes
maxnodes = 20 # Cap the number of terminal nodes
)
train_pred <- predict(rf_cv_model_part, newdata = x_train_part, type = "prob")[, 2] # Probabilities for positive class
test_pred <- predict(rf_cv_model_part, newdata = x_rf_train[-train_idx, ], type = "prob")[, 2] # Probabilities for positive class
# Calculate AUC for training and test sets
train_scores[i] <- roc(response = as.numeric(y_train_part) - 1, predictor = train_pred)$auc
test_scores[i] <- roc(response = as.numeric(y_rf_train[-train_idx]) - 1, predictor = test_pred)$auc
} else {
train_scores[i] <- NA
test_scores[i] <- NA
}
}
# Function to plot learning curves
plot_learning_curve <- function(train_sizes, train_scores, test_scores) {
plot(train_sizes, train_scores, type = "o", col = "blue", ylim = range(c(train_scores, test_scores), na.rm = TRUE), xlab = "Training Size", ylab = "Score", main = "Learning Curves")
lines(train_sizes, test_scores, type = "o", col = "red")
legend("bottomright", legend = c("Train", "Validation"), col = c("blue", "red"), lty = 1)
}
# Plot the learning curves
plot_learning_curve(train_sizes, train_scores, test_scores)
cat("Training AUCs:", train_scores, "\n")
cat("Validation AUCs:", test_scores, "\n")
```
Debugging: a stripped-down run to check that training completes.
```{r}
# Simplified code for debugging
library(data.table)
library(randomForest)
library(caret)
library(pROC)
# Load the dataset
RF_impute_df <- fread("RF_imputation_NEW.csv")
# Subset the data to exclude unnecessary columns
RF_complete_df <- subset(RF_impute_df, select = -c(deathtime, survival_time, LOS, Unnamed_0, V1, admittime, ID, group, tLOS))
# Ensure the outcome variable is included and properly named
if (!("outcome" %in% names(RF_complete_df))) {
stop("The 'outcome' column does not exist in the dataframe.")
}
# Ensure the outcome variable has valid factor levels
RF_complete_df$outcome <- factor(RF_complete_df$outcome, levels = unique(RF_complete_df$outcome), labels = make.names(unique(RF_complete_df$outcome)))
# Add unique row identifiers
RF_complete_df$rn <- seq_len(nrow(RF_complete_df))
# Properly partition data to avoid leakage
set.seed(510)
train_idx <- createDataPartition(RF_complete_df$outcome, p = 0.8, list = FALSE)
train_data <- RF_complete_df[train_idx, ]
test_data <- RF_complete_df[-train_idx, ]
# Ensure no overlap
if (length(intersect(train_data$rn, test_data$rn)) == 0) {
cat("No data leakage detected.\n")
} else {
stop("Data leakage detected.")
}
# Remove the unique row identifiers
train_data <- train_data[, -c("rn")]
test_data <- test_data[, -c("rn")]
# Prepare the predictor matrix and outcome variable for training data
x_rf_train <- as.matrix(train_data[, setdiff(names(train_data), "outcome"), with = FALSE])
y_rf_train <- train_data$outcome
# Prepare the predictor matrix and outcome variable for test data
x_rf_test <- as.matrix(test_data[, setdiff(names(test_data), "outcome"), with = FALSE])
y_rf_test <- test_data$outcome
# Debugging output
print("Starting model training...")
print(dim(x_rf_train))
print(length(y_rf_train)) # dim() is NULL for a factor; length() gives the sample count
# Set seed for reproducibility
set.seed(510)
# Define cross-validation control
train_control <- trainControl(
method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
# Train the Random Forest model with more trees and wider hyperparameter tuning
rf_cv_model <- train(
x = x_rf_train,
y = y_rf_train,
method = "rf",
trControl = train_control,
tuneLength = 3, # Reduced tune length for faster debugging
metric = "ROC",
importance = TRUE,
ntree = 100, # Reduced number of trees for faster debugging
nodesize = 20, # Increased minimum size of terminal nodes for regularization
maxnodes = 10 # Reduced maximum number of terminal nodes for regularization
)
# Print the results of the cross-validation
print(rf_cv_model)
# Evaluate the model on the test set
test_pred_probs <- predict(rf_cv_model, newdata = x_rf_test, type = "prob")[, 2]
test_roc <- roc(response = as.numeric(y_rf_test) - 1, predictor = test_pred_probs)
test_auc <- test_roc$auc
cat("Test AUC:", test_auc, "\n")
# Confusion matrix on test set
test_pred_class <- ifelse(test_pred_probs > 0.5, 1, 0)
conf_matrix_test <- table(Predicted = test_pred_class, Actual = as.numeric(y_rf_test) - 1)
print(conf_matrix_test)
# Extract and plot feature importances
importance_rf <- varImp(rf_cv_model, scale = FALSE)
importance_values <- importance_rf$importance[, 1] # Extract importance values
# Print feature importances
cat("Feature importances from Random Forest (cross-validated):\n")
print(importance_values)
# Plot feature importances
png("variable_importance.png")
plot(importance_rf, main = "Variable Importance (Cross-Validated)")
dev.off()
```
```{r}
# Load necessary libraries
library(data.table)
library(randomForest)
library(caret)
library(pROC)
# Load the dataset
RF_impute_df <- fread("RF_imputation_NEW.csv")
# Subset the data to exclude unnecessary columns
RF_complete_df <- subset(RF_impute_df, select = -c(deathtime, survival_time, LOS, Unnamed_0, V1, admittime, ID, group, tLOS))
# Ensure the outcome variable is included and properly named
if (!("outcome" %in% names(RF_complete_df))) {
stop("The 'outcome' column does not exist in the dataframe.")
}
# Ensure the outcome variable has valid factor levels
RF_complete_df$outcome <- factor(RF_complete_df$outcome, levels = unique(RF_complete_df$outcome), labels = make.names(unique(RF_complete_df$outcome)))
# Add unique row identifiers
RF_complete_df$rn <- seq_len(nrow(RF_complete_df))
# Properly partition data to avoid leakage
set.seed(510)
train_idx <- createDataPartition(RF_complete_df$outcome, p = 0.8, list = FALSE)
train_data <- RF_complete_df[train_idx, ]
test_data <- RF_complete_df[-train_idx, ]
# Ensure no overlap
if (length(intersect(train_data$rn, test_data$rn)) == 0) {
cat("No data leakage detected.\n")
} else {
stop("Data leakage detected.")
}
# Remove the unique row identifiers
train_data <- train_data[, -c("rn")]
test_data <- test_data[, -c("rn")]
# Prepare the predictor matrix and outcome variable for training data
x_rf_train <- as.matrix(train_data[, setdiff(names(train_data), "outcome"), with = FALSE])
y_rf_train <- train_data$outcome
# Prepare the predictor matrix and outcome variable for test data
x_rf_test <- as.matrix(test_data[, setdiff(names(test_data), "outcome"), with = FALSE])
y_rf_test <- test_data$outcome
# Debugging output
print("Starting model training...")
print(dim(x_rf_train))
print(length(y_rf_train)) # dim() is NULL for a factor; length() gives the sample count
# Set seed for reproducibility
set.seed(510)
# Define cross-validation control
train_control <- trainControl(
method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
# Enhanced hyperparameter tuning with a grid search
tune_grid <- expand.grid(
mtry = c(2, 5, 10, 15, 20) # Adjusted mtry values
)
rf_cv_model <- train(
x = x_rf_train,
y = y_rf_train,
method = "rf",
trControl = train_control,
tuneGrid = tune_grid,
metric = "ROC",
importance = TRUE,
ntree = 200, # Reduced ntree
nodesize = 20 # Increased nodesize
)
# Print the results of the cross-validation
print(rf_cv_model)
# Evaluate the model on the training set
train_pred_probs <- predict(rf_cv_model, newdata = x_rf_train, type = "prob")[, 2]
train_roc <- roc(response = as.numeric(y_rf_train) - 1, predictor = train_pred_probs)
train_auc <- train_roc$auc
cat("Train AUC:", train_auc, "\n")
# Confusion matrix on training set
train_pred_class <- ifelse(train_pred_probs > 0.5, 1, 0)
conf_matrix_train <- table(Predicted = train_pred_class, Actual = as.numeric(y_rf_train) - 1)
print(conf_matrix_train)
# Evaluate the model on the test set
test_pred_probs <- predict(rf_cv_model, newdata = x_rf_test, type = "prob")[, 2]
test_roc <- roc(response = as.numeric(y_rf_test) - 1, predictor = test_pred_probs)
test_auc <- test_roc$auc
cat("Test AUC:", test_auc, "\n")
# Confusion matrix on test set
test_pred_class <- ifelse(test_pred_probs > 0.5, 1, 0)
conf_matrix_test <- table(Predicted = test_pred_class, Actual = as.numeric(y_rf_test) - 1)
print(conf_matrix_test)
# Compare AUC values
cat("Training AUC vs Test AUC:\n")
cat("Training AUC:", train_auc, "\n")
cat("Test AUC:", test_auc, "\n")
# Compare confusion matrices
cat("Confusion Matrix - Training Data:\n")
print(conf_matrix_train)
cat("Confusion Matrix - Test Data:\n")
print(conf_matrix_test)
# Extract and plot feature importances
importance_rf <- varImp(rf_cv_model, scale = FALSE)
importance_values <- importance_rf$importance[, 1] # Extract importance values
# Print feature importances
cat("Feature importances from Random Forest (cross-validated):\n")
print(importance_values)
# Plot feature importances
png("variable_importance.png")
plot(importance_rf, main = "Variable Importance (Cross-Validated)")
dev.off()
```
Tests: the model is definitely overfitting (training AUC well above test AUC); next step is handling class imbalance. Calling it a night.
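Before changing the sampling, a quick check of how big the gap actually is. A minimal sketch, assuming `rf_cv_model`, `x_rf_train`, and `y_rf_train` from the previous chunk are still in scope:
```{r}
# Hedged sketch: compare resubstitution AUC (model scored on its own training
# data) against the cross-validated AUC caret already computed. A large gap is
# the classic overfitting signature.
resub_probs <- predict(rf_cv_model, newdata = x_rf_train, type = "prob")[, 2]
resub_auc <- roc(response = as.numeric(y_rf_train) - 1, predictor = resub_probs)$auc
cv_auc <- max(rf_cv_model$results$ROC)
cat("Resubstitution AUC:", resub_auc, "| CV AUC:", cv_auc, "| gap:", resub_auc - cv_auc, "\n")
```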
```{r}
# Load necessary libraries
library(data.table)
library(randomForest)
library(caret)
library(pROC)
library(smotefamily)
# Load the dataset
RF_impute_df <- fread("RF_imputation_NEW.csv")
# Subset the data to exclude unnecessary columns
RF_complete_df <- subset(RF_impute_df, select = -c(deathtime, survival_time, LOS, Unnamed_0, V1, admittime, ID, group, tLOS))
# Ensure the outcome variable is included and properly named
if (!("outcome" %in% names(RF_complete_df))) {
stop("The 'outcome' column does not exist in the dataframe.")
}
# Ensure the outcome variable has valid factor levels
RF_complete_df$outcome <- factor(RF_complete_df$outcome, levels = unique(RF_complete_df$outcome), labels = make.names(unique(RF_complete_df$outcome)))
# Add unique row identifiers
RF_complete_df$rn <- seq_len(nrow(RF_complete_df))
# Properly partition data to avoid leakage
set.seed(857)
train_idx <- createDataPartition(RF_complete_df$outcome, p = 0.8, list = FALSE)
train_data <- RF_complete_df[train_idx, ]
test_data <- RF_complete_df[-train_idx, ]
# Ensure no overlap
if (length(intersect(train_data$rn, test_data$rn)) == 0) {
cat("No data leakage detected.\n")
} else {
stop("Data leakage detected.")
}
# Remove the unique row identifiers
train_data <- train_data[, -c("rn")]
test_data <- test_data[, -c("rn")]
# Prepare the predictor matrix and outcome variable for training data
x_rf_train <- as.matrix(train_data[, setdiff(names(train_data), "outcome"), with = FALSE])
y_rf_train <- train_data$outcome
# Prepare the predictor matrix and outcome variable for test data
x_rf_test <- as.matrix(test_data[, setdiff(names(test_data), "outcome"), with = FALSE])
y_rf_test <- test_data$outcome
# Debugging output
print("Starting model training...")
print(dim(x_rf_train))
print(length(y_rf_train)) # dim() is NULL for a factor; length() gives the sample count
# Set seed for reproducibility
set.seed(857)
# Define cross-validation control with SMOTE sampling to handle class imbalance
train_control <- trainControl(
method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final",
sampling = "smote" # Apply SMOTE for oversampling the minority class
)
# Enhanced hyperparameter tuning with a grid search
tune_grid <- expand.grid(
mtry = c(2, 5, 10, 15, 20) # Adjusted mtry values
)
# Train the Random Forest model
rf_cv_model <- train(
x = x_rf_train,
y = y_rf_train,
method = "rf",
trControl = train_control,
tuneGrid = tune_grid,
metric = "ROC",
importance = TRUE,
ntree = 500, # Number of trees
nodesize = 300 # Very large terminal nodes for aggressive regularization
)
# Print the results of the cross-validation
print(rf_cv_model)
# Evaluate the model on the training set
train_pred_probs <- predict(rf_cv_model, newdata = x_rf_train, type = "prob")[, 2]
train_roc <- roc(response = as.numeric(y_rf_train) - 1, predictor = train_pred_probs)
train_auc <- train_roc$auc
cat("Train AUC:", train_auc, "\n")
# Confusion matrix on training set
train_pred_class <- ifelse(train_pred_probs > 0.5, 1, 0)
conf_matrix_train <- table(Predicted = train_pred_class, Actual = as.numeric(y_rf_train) - 1)
print(conf_matrix_train)
# Evaluate the model on the test set
test_pred_probs <- predict(rf_cv_model, newdata = x_rf_test, type = "prob")[, 2]
test_roc <- roc(response = as.numeric(y_rf_test) - 1, predictor = test_pred_probs)
test_auc <- test_roc$auc
cat("Test AUC:", test_auc, "\n")
# Confusion matrix on test set
test_pred_class <- ifelse(test_pred_probs > 0.5, 1, 0)
conf_matrix_test <- table(Predicted = test_pred_class, Actual = as.numeric(y_rf_test) - 1)
print(conf_matrix_test)
# Compare AUC values
cat("Training AUC vs Test AUC:\n")
cat("Training AUC:", train_auc, "\n")
cat("Test AUC:", test_auc, "\n")
# Compare confusion matrices
cat("Confusion Matrix - Training Data:\n")
print(conf_matrix_train)
cat("Confusion Matrix - Test Data:\n")
print(conf_matrix_test)
# Check for class imbalance in training data
cat("Class distribution in training data:\n")
print(prop.table(table(y_rf_train)))
# Check for class imbalance in test data
cat("Class distribution in test data:\n")
print(prop.table(table(y_rf_test)))
```
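One caveat on the chunk above: caret's `sampling = "smote"` has historically called `DMwR::SMOTE` internally, so loading `smotefamily` does not by itself wire SMOTE into `trainControl`. If DMwR is unavailable, a sketch of oversampling explicitly with `smotefamily` before training (assumes all predictors in `x_rf_train` are numeric):
```{r}
# Hedged sketch: explicit SMOTE with smotefamily instead of caret's built-in
# sampling hook. Note: oversampling before CV lets synthetic points leak into
# validation folds, so caret's in-fold sampling (as above) is the safer default.
smote_out <- SMOTE(X = as.data.frame(x_rf_train), target = y_rf_train, K = 5)
x_rf_bal <- as.matrix(smote_out$data[, setdiff(names(smote_out$data), "class")])
y_rf_bal <- factor(smote_out$data$class, levels = levels(y_rf_train))
cat("Class distribution after SMOTE:\n")
print(prop.table(table(y_rf_bal)))
```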
Try again, this time with centering/scaling and recursive feature elimination (RFE).
```{r}
# Load necessary libraries
library(data.table)
library(caret)
library(pROC)
library(randomForest)
# Load the dataset
RF_impute_df <- fread("RF_imputation_NEW.csv")
# Subset the data to exclude unnecessary columns and ensure the outcome variable is included and properly named
RF_complete_df <- subset(RF_impute_df, select = -c(deathtime, survival_time, LOS, Unnamed_0, V1, admittime, ID, group, tLOS))
# Ensure valid factor levels for outcome
RF_complete_df$outcome <- factor(make.names(RF_complete_df$outcome))
# Verify levels
levels(RF_complete_df$outcome)
# Split the dataset into training and test sets
set.seed(403)
train_idx <- createDataPartition(RF_complete_df$outcome, p = 0.8, list = FALSE)
train_data <- RF_complete_df[train_idx, ]
test_data <- RF_complete_df[-train_idx, ]
# Define predictor matrix and outcome variable for training data
x_train <- as.matrix(train_data[, setdiff(names(train_data), "outcome"), with = FALSE])
y_train <- train_data$outcome
# Define predictor matrix and outcome variable for test data
x_test <- as.matrix(test_data[, setdiff(names(test_data), "outcome"), with = FALSE])
y_test <- as.numeric(test_data$outcome) - 1 # Convert outcome to 0 and 1
# Scale the data
preProc <- preProcess(x_train, method = c("center", "scale"))
x_train_scaled <- predict(preProc, x_train)
x_test_scaled <- predict(preProc, x_test)
# Feature Selection using Recursive Feature Elimination (RFE)
ctrl <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
rfe_fit <- rfe(x_train_scaled, y_train, sizes = c(1:5, 10, 15, 20), rfeControl = ctrl)
# Print the results of RFE
print(rfe_fit)
# Get selected features
selected_features <- predictors(rfe_fit)
cat("Selected Features:", selected_features, "\n")
# Subset the training and test data to the selected features
x_train_selected <- x_train_scaled[, selected_features, drop = FALSE]
x_test_selected <- x_test_scaled[, selected_features, drop = FALSE]
# Random Forest using caret
set.seed(403)
rf_model <- train(
x = x_train_selected,
y = y_train,
method = "rf",
trControl = trainControl(method = "cv", number = 10, classProbs = TRUE, summaryFunction = twoClassSummary),
metric = "ROC", # twoClassSummary reports ROC/Sens/Spec, so tune on ROC explicitly
tuneLength = 5
)
# Predict on the test data
rf_pred_probs <- predict(rf_model, newdata = x_test_selected, type = "prob")[,2]
rf_pred_class <- ifelse(rf_pred_probs > 0.5, 1, 0)
# Calculate AUC
rf_roc_obj <- roc(y_test, rf_pred_probs)
rf_auc <- rf_roc_obj$auc
cat("Random Forest AUC:", rf_auc, "\n")
# Plot ROC curve
plot.roc(rf_roc_obj, main = "ROC Curve (Random Forest)")
# Calculate log-loss
log_loss <- function(y_true, y_prob) {
epsilon <- 1e-15
y_prob <- pmax(epsilon, pmin(1 - epsilon, y_prob))
-mean(y_true * log(y_prob) + (1 - y_true) * log(1 - y_prob))
}
rf_log_loss_value <- log_loss(y_test, rf_pred_probs)
cat("Random Forest Log-loss:", rf_log_loss_value, "\n")