Merge pull request #53 from adamlilith/daisywheel

`trainGLM()` and `trainNS()`: fix bug when factor levels >2; remove unconverged models
adamlilith · Dec 8, 2024 · 73c829f · 73c829f
2 parents d26a6cd + b65c034
commit 73c829f
Show file tree

Hide file tree

Showing 8 changed files with 390 additions and 161 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: enmSdmX
 Type: Package
 Title: Species Distribution Modeling and Ecological Niche Modeling
-Version: 1.1.10
-Date: 2024-12-03
+Version: 1.2.10
+Date: 2024-12-07
 Authors@R: 
 	c(
 		person(

diff --git a/NEWS.md b/NEWS.md
@@ -1,8 +1,9 @@
-# enmSdmX 1.1.10 2024-12-03
-- `trainNS()` works when the number of predictors is >2 and interactions are allowed between variables (thank you again, Pascal!).  
+# enmSdmX 1.2.10 2024-12-07
+- `trainGLM()` now allows users to remove models that did not converge or had boundary problems. The function also works for factor predictors with > 2 levels. Thank you, A.L.!  
+- `trainNS()` works when the number of predictors is >2 and interactions are allowed between variables. Thank you, P.T.! It also handles factors with >2 levels and allows users to remove models that did not converge or had boundary problems. Thank you, A.L.!
 
 # enmSdmX 1.1.9 2024-11-01
-- `trainGAM()` does not include interaction terms if `interaction` is `NULL`, and does not fail when interactions are included and the number of predictors is >2 (thank you, Pascal!).  
+- `trainGAM()` does not include interaction terms if `interaction` is `NULL`, and does not fail when interactions are included and the number of predictors is >2 (thank you, P.T.!).  
 
 # enmSdmX 1.1.8 2024-10-02
 - `modelSize()` can now tell size of a `ranger` random forest.

diff --git a/R/aaa.r b/R/aaa.r
@@ -6,8 +6,12 @@
 
 	ver <- read.dcf(file=system.file('DESCRIPTION', package=pkg), fields='Version')
 	packageStartupMessage(paste0('This is ', pkg, ' ', ver, '.'))
-	packageStartupMessage(paste0('Back-incompatible changes starting starting with enmSdmX version 1.1.1:'))
+
+	packageStartupMessage(paste0('Back-incompatible changes starting starting with enmSdmX version 1.1.1 (2023-06-11):'))
 	packageStartupMessage(paste0('* trainRF() replaces use of the randomForest package with the faster ranger package, which produces random forests that are statistically equivalent.'))
 	packageStartupMessage(paste0('* geoFold() uses ', dQuote('complete'), ' clustering by default (was ', dQuote('single'), ').'))
 
+	packageStartupMessage(paste0('Back-incompatible changes starting starting with enmSdmX version 1.2.10 (2024-12-07):'))
+	packageStartupMessage(paste0('* By default, trainGLM() and trainNS() remove unconverged models and models with boundary issues.'))
+
 }
diff --git a/R/trainGlm.r b/R/trainGlm.r
@@ -25,6 +25,8 @@
 #'  \item A numeric vector of weights, one per row in \code{data}.
 #' 	\item The name of the column in \code{data} that contains site weights.
 #' }
+#' @param removeInvalid Logical. If \code{TRUE} (default), remove models that either did not converge or have parameter estimates near the boundaries (usually negative or positive infinity). If you run this function with `construct = TRUE` (i.e., construct a "full" model from the best "small" models), then any small model that either did not converge or had parameters that are near the boundary (usually negative or positive infinity) are removed from consideration as terms in "full" model.
+#' @param failIfInvalid Logical. If \code{TRUE} (default), and the "full" model either does not converge or has parameters near the boundary, then the function will fail. If \code{FALSE}, then return \code{NULL} in this case.
 #' @param out Character vector. One or more values:
 #' \itemize{
 #' 	\item	\code{'model'}: Model with the lowest AICc.
@@ -35,7 +37,7 @@
 #' @param verbose Logical. If \code{TRUE} then display progress.
 #' @param ... Arguments to pass to \code{glm}.
 #'
-#' @returns The object that is returned depends on the value of the \code{out} argument. It can be a model object, a data frame, a list of models, or a list of all two or more of these. If \code{scale} is \code{TRUE}, any model object will also have an element named \code{$scale}, which contains the means and standard deviations for predictors that are not factors.
+#' @returns The object that is returned depends on the value of the \code{out} argument. It can be a model object, a data frame, a list of models, or a list of all two or more of these. If \code{scale} is \code{TRUE}, any model object will also have an element named \code{$scale}, which contains the means and standard deviations for predictors that are not factors. The data frame reports the AICc for all of the models, sorted by best to worst. The \code{converged} column indicates whether the model converged ("\code{TRUE}" is good), and the \code{boundary} column whether the model parameters are near the boundary (usually, negative or positive infinity; "\code{FALSE}" is good).
 #'
 #' @seealso \code{\link[stats]{glm}}
 #'
@@ -58,6 +60,8 @@ trainGLM <- function(
 	maxTerms = 8,
 	w = TRUE,
 	family = stats::binomial(),
+	removeInvalid = TRUE,
+	failIfInvalid = TRUE,
 	out = 'model',
 	cores = 1,
 	verbose = FALSE,
@@ -70,6 +74,7 @@ trainGLM <- function(
 	if (FALSE) {
 
 		resp <- 'presBg'
+
 		construct <- TRUE
 		select <- TRUE
 		quadratic <- TRUE
@@ -79,16 +84,13 @@ trainGLM <- function(
 		presPerTermFinal <- 10
 		maxTerms <- 8
 		w <- TRUE
+		removeInvalid <- TRUE
+		failIfInvalid <- TRUE
 		scale <- TRUE
 		family <- stats::binomial()
 		out <- 'model'
 		cores <- 1
 		verbose <- TRUE
-
-		# speed <- TRUE
-		# method <- 'eigen'
-
-		# speed <- FALSE
 		method <- 'glm.fit'
 
 	}
@@ -161,8 +163,9 @@ trainGLM <- function(
 
 		### create vector of terms
 		terms <- preds
-		if (quadratic) terms <- c(terms, .makeQuadsMarginality(preds=preds, n=n, presPerTermInitial=presPerTermInitial))
-		if (interaction) terms <- c(terms, .makeIAsMarginality(preds=preds, n=n, presPerTermInitial=presPerTermInitial))
+		factors <- sapply(data[ , preds, drop = FALSE], is.factor)
+		if (quadratic) terms <- c(terms, .makeQuadsMarginality(preds = preds, n = n, presPerTermInitial = presPerTermInitial, factors = factors))
+		if (interaction) terms <- c(terms, .makeIAsMarginality(preds = preds, n = n, presPerTermInitial = presPerTermInitial))
 
 	## term-by-term model construction
 	##################################
@@ -191,11 +194,29 @@ trainGLM <- function(
 			)
 		}
 
-		assess <- assess[order(assess$AICc), , drop=FALSE]
+		if (removeInvalid) {
+
+			bads <- which(!assess$converged | assess$boundary)
+			if (length(bads) > 0) {
+				assess <- assess[-bads, , drop = FALSE]
+
+				if (nrow(assess) == 0) {
+					msg <- 'No single-term models converged or all models had parameter estimates near the boundary.'
+					if (failIfInvalid) {
+						stop(msg)
+					} else {
+						warning(msg)
+						return(NULL)
+					}
+				}
+			}
+		}
+
+		assess <- assess[order(assess$AICc), , drop = FALSE]
 		rownames(assess) <- NULL
 
 		if (verbose) {
-			omnibus::say('Term-by-term evaluation:', pre=1)
+			omnibus::say('Term-by-term evaluation:', pre = 1)
 			print(assess)
 			utils::flush.console()
 		}
@@ -229,34 +250,56 @@ trainGLM <- function(
 			}
 
 			thisForm <- paste0(resp, ' ~ 1 + ', form)
-			numTerms <- lengths(regmatches(thisForm, gregexpr('\\+', thisForm))) + 1
-			start <- rep(0, numTerms)
+			thisForm <- stats::as.formula(thisForm)
 
-			model <- stats::glm(
-				formula = stats::as.formula(thisForm),
+			mm <- stats::model.matrix(thisForm, data)
+			start <- rep(0, ncol(mm))
+
+			model <- suppressWarnings(stats::glm(
+				formula = thisForm,
 				family = family,
 				data = data,
 				method = method,
 				weights = w,
 				start = start,
 				...
-			)
+			))
 
 			AICc <- AICcmodavg::AICc(model)
 
 			tuning <- data.frame(
 				model = form,
+				converged = model$converged,
+				boundary = model$boundary,
 				AICc = AICc
 			)
 
 			models <- NULL
 
+			if (!tuning$converged | tuning$boundary) {
+
+				msg <- 'The model did not converge and/or estimates are near boundary conditions.'
+				if (removeInvalid) {
+
+					if (failIfInvalid) {
+						stop(msg)
+					} else {
+						warning(msg)
+						return(NULL)
+					}
+				} else {
+					warning(msg)
+				}
+
+			}
+
+
 			if (verbose) {
 
 				omnibus::say('Final model (construction from best terms, but no selection):', pre=1)
 				print(summary(model))
 				utils::flush.console()
-				
+
 			}
 
 		### model selection
@@ -373,15 +416,44 @@ trainGLM <- function(
 			}
 
 			# compile all possible models and rank
-			tuning <- data.frame(model = rep(NA_character_, length(assess)), AICc = rep(NA_real_, length(assess)))
+			n <- length(assess)
+			tuning <- data.frame(
+				model = rep(NA_character_, n),
+				converged = rep(NA, n),
+				boundary = rep(NA, n),
+				AICc = rep(NA_real_, n)
+			)
+
 			models <- list()
 			for (i in seq_along(assess)) {
 				models[[i]] <- assess[[i]]$model
 				if (!is.na(scale)) if (scale) models[[i]]$scale <- scales
 				tuning$model[i] <- assess[[i]]$formula
+				tuning$converged[i] <- assess[[i]]$converged
+				tuning$boundary[i] <- assess[[i]]$boundary
 				tuning$AICc[i] <- assess[[i]]$AICc
 			}
 
+			if (removeInvalid) {
+
+				bads <- which(!tuning$converged | tuning$boundary)
+				if (length(bads) > 0) {
+
+					tuning <- tuning[-bads, , drop = FALSE]
+					models <- models[-bads]
+
+					if (nrow(tuning) == 0) {
+						msg <- 'No models converged or all had parameter estimates near the boundary of parameter space.'
+						if (failIfInvalid) {
+							stop(msg)
+						} else {
+							warning(msg)
+							return(NULL)
+						}
+					}
+				}
+			}
+
 			ranks <- order(tuning$AICc)
 			models <- models[ranks]
 			tuning <- tuning[ranks, , drop=FALSE]
@@ -406,30 +478,50 @@ trainGLM <- function(
 		form <- unique(form)
 		form <- paste(form, collapse = ' + ')
 		thisForm <- paste0(resp, ' ~ 1 + ', form)
-		numTerms <- lengths(regmatches(thisForm, gregexpr('\\+', thisForm))) + 1
-		start <- rep(0, numTerms)
+		thisForm <- stats::as.formula(thisForm)
+
+		mm <- stats::model.matrix(thisForm, data)
+		start <- rep(0, ncol(mm))
 
-		model <- stats::glm(
-			formula = stats::as.formula(thisForm),
+		model <- suppressWarnings(stats::glm(
+			formula = thisForm,
 			family = family,
 			data = data,
 			method = method,
 			weights = w,
 			start = start,
 			...
-		)
+		))
 
 		AICc <- AICcmodavg::AICc(model)
 
 		tuning <- data.frame(
 			model = form,
+			converged = model$converged,
+			boundary = model$boundary,
 			AICc = AICc
 		)
 
 		models <- NULL
 
-		if (select) warning('Model selection is not performed when argument "construct" is FALSE.')
+		if (select) warning('Model selection is not performed when argument `construct` is FALSE.')
+		if (!model$converged | model$boundary) {
+
+			msg <- 'The model did not converge and/or parameters are near the boundary space.'
+			if (removeInvalid) {
+
+				if (failIfInvalid) {
+					stop(msg)
+				} else {
+					warning(msg)
+					return(NULL)
+				}
+			} else {
+				warning(msg)
+			}
 
+		}
+
 		if (verbose) {
 
 			omnibus::say('Model (no construction or selection):', level=2)
@@ -454,6 +546,7 @@ trainGLM <- function(
 		output <- models
 	} else if ('model' %in% out) {
 		output <- model
+		if (!tuning$converged[1] | tuning$boundary[1]) warning('The top model did not converge and/or had parameter estimates near the boundary.')
 	} else if ('tuning' %in% out) {
 		output <- tuning
 	}
@@ -497,18 +590,19 @@ trainGLM <- function(
 		}
 	}
 	thisForm <- paste0(resp, ' ~ ', form)
-	numTerms <- lengths(regmatches(thisForm, gregexpr('\\+', thisForm))) + 1
-	start <- rep(0, numTerms)
+	thisForm <- stats::as.formula(thisForm)
+	mm <- stats::model.matrix(thisForm, data)
+	start <- rep(0, ncol(mm))
 
-	model <- stats::glm(
+	model <- suppressWarnings(stats::glm(
 		formula = stats::as.formula(thisForm),
 		data = data,
 		family = family,
 		weights = w,
 		method = method,
 		start = start,
 		...
-	)
+	))
 
 	AICc <- AICcmodavg::AICc(model)
 
@@ -519,6 +613,8 @@ trainGLM <- function(
 			list(
 				model = model,
 				formula = form,
+				converged = model$converged,
+				boundary = model$boundary,
 				AICc = AICc
 			)
 		)
@@ -527,6 +623,8 @@ trainGLM <- function(
 
 		out <- data.frame(
 			formula = form,
+			converged = model$converged,
+			boundary = model$boundary,
 			AICc = AICc
 		)
 
@@ -536,11 +634,21 @@ trainGLM <- function(
 }
 
 # make vector of quadratic terms respecting marginality
-.makeQuadsMarginality <- function(preds, n, presPerTermInitial) {
+.makeQuadsMarginality <- function(
+	preds,					# vector of predictor names
+	n,						# sample size
+	presPerTermInitial,		# number of presences per term
+	factors					# logical: is each predictor a factor?
+) {
 
 	quads <- character()
 	if (n >= 2 * presPerTermInitial) {
-		for (i in seq_along(preds)) quads <- c(quads, paste0(preds[i], ' + I(', preds[i], '^2)'))
+		for (i in seq_along(preds)) {
+			pred <- preds[i]
+			if (!factors[[pred]]) {
+				quads <- c(quads, paste0(pred, ' + I(', pred, '^2)'))
+			}
+		}
 	}
 	quads