Merge pull request #165 from nf-core/grohmm-overhaul

nf-core · Oct 23, 2024 · bb65227 · bb65227
2 parents af25b3f + 9c65113
commit bb65227
Show file tree

Hide file tree

Showing 36 changed files with 1,249 additions and 517 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -8,7 +8,7 @@ trim_trailing_whitespace = true
 indent_size = 4
 indent_style = space
 
-[*.{md,yml,yaml,html,css,scss,js}]
+[*.{md,yml,yaml,html,css,scss,js,R,Rmd}]
 indent_size = 2
 
 # These files are edited and tested upstream in nf-core/modules
@@ -31,3 +31,7 @@ indent_size = unset
 # ignore python and markdown
 [*.{py,md}]
 indent_style = unset
+
+# Follow tidyverse style for R
+[*.{R,Rmd}]
+indent_size = 2
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,11 +22,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#137](https://github.com/nf-core/nascent/pull/137) - Use singularity containers for PINTS
 - [#142](https://github.com/nf-core/nascent/pull/142) - Updated CHM13 references
 - [#171](https://github.com/nf-core/nascent/pull/171) - Use assertAll in tests
+- [#165](https://github.com/nf-core/nascent/pull/165) - groHMM overhaul. Removed R mclapply calls and replaced with Nextflow scatter gather for parameter tuning. This creates a job for each parameter set.
 
 ### Fixed
 
 - [#170](https://github.com/nf-core/nascent/pull/170) - Remove "Access to undefined parameter forwardStranded" warnings
 
+### Removed
+
+- [[#165](https://github.com/nf-core/nascent/pull/165)] - Removed support for groHMM tuning files.
+
 ## v2.2.0 - 2024-03-05
 
 ### Added

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -50,7 +50,7 @@ custom_data:
     plot_type: "image"
 sp:
   grohmm_plot:
-    fn: "*.tdplot_mqc.jpg"
+    fn: "*.tdplot_mqc.png"
 ignore_images: false
 
 export_plots: true

diff --git a/bin/grohmm_parametertuning.R b/bin/grohmm_parametertuning.R
@@ -0,0 +1,170 @@
+#!/usr/bin/env Rscript
+
+suppressPackageStartupMessages(library(argparse))
+suppressPackageStartupMessages(library(GenomicFeatures))
+suppressPackageStartupMessages(library(GenomicAlignments))
+suppressPackageStartupMessages(library(groHMM))
+
+parser <- ArgumentParser(description = "Run groHMM on some bam files")
+
+parser$add_argument(
+  "-i",
+  "--bam_files",
+  type = "character",
+  nargs = "+",
+  metavar = "path",
+  help = "GRO SEQ data in bam files.",
+  required = TRUE
+)
+parser$add_argument(
+  "-o",
+  "--outdir",
+  type = "character",
+  default = "./",
+  metavar = "path",
+  help = "Output directory."
+)
+parser$add_argument(
+  "-l",
+  "--ltprobb",
+  type = "integer",
+  default = -200,
+  metavar = "integer",
+  help = cat(
+    "Log-transformed transition probability of switching from transcribed
+        state to non-transcribed state"
+  )
+)
+parser$add_argument(
+  "-u",
+  "--uts",
+  type = "integer",
+  default = 5,
+  metavar = "integer",
+  help = cat(
+    "Variance of the emission probability for reads in the
+        non-transcribed state, respectively."
+  )
+)
+parser$add_argument(
+  "-p",
+  "--outprefix",
+  type = "character",
+  default = "grohmm",
+  metavar = "string",
+  help = "Output prefix."
+)
+parser$add_argument(
+  "-g",
+  "--gxf",
+  type = "character",
+  default = NULL,
+  metavar = "string",
+  help = "GFF/GTF File to create TxDb",
+  required = TRUE
+)
+parser$add_argument(
+  "-c",
+  "--cores",
+  type = "integer",
+  default = 1,
+  metavar = "integer",
+  help = "Number of cores."
+)
+parser$add_argument(
+  "-m",
+  "--memory",
+  type = "integer",
+  metavar = "integer",
+  help = "Amount of memory in MB"
+)
+
+args <- parser$parse_args()
+
+options(mc.cores = getCores(args$cores))
+memory.limit(size = args$memory)
+setwd(args$outdir)
+
+if (is.null(args$bam_files)) {
+  print_help(args)
+  stop("Please provide a bam file", call. = FALSE)
+}
+
+# Load alignment files
+# TODO? CHANGE BASED ON PAIRED OR SINGLE END
+alignments <- c()
+for (bam in args$bam_files) {
+  alignments <- append(
+    alignments,
+    as(readGAlignments(bam), "GRanges")
+  )
+  alignments <- keepStandardChromosomes(alignments, pruning.mode = "coarse")
+}
+
+print("Input transcript annotations")
+kg_db <- makeTxDbFromGFF(args$gxf)
+kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name"))
+print("Collapse annotations in preparation for overlap")
+kg_consensus <- makeConsensusAnnotations(
+  kg_tx,
+  mc.cores = args$cores
+)
+print("Finished consensus annotations")
+
+############
+## TUNING ##
+############
+print("Starting tuning run")
+tune <- data.frame(
+  LtProbB = args$ltprobb,
+  UTS = args$uts
+)
+fp <- windowAnalysis(alignments, strand = "+", windowSize = 50)
+fm <- windowAnalysis(alignments, strand = "-", windowSize = 50)
+hmm <- detectTranscripts(
+  Fp = Fp,
+  Fm = Fm,
+  reads = alignments,
+  LtProbB = args$ltprobb,
+  UTS = args$uts
+)
+print("Evaluating")
+e <- evaluateHMMInAnnotations(hmm$transcripts, kg_consensus)
+
+# Extract evaluation metrics and convert to a data frame
+eval_metrics <- as.data.frame(e$eval)
+
+# If eval_metrics is a list of lists, unlist it
+if (is.list(eval_metrics[[1]])) {
+  eval_metrics <- as.data.frame(t(sapply(e$eval, unlist)))
+}
+
+# Combine the tuning parameters with the evaluation metrics
+tune <- cbind(tune, eval_metrics)
+
+print(e$eval)
+print(e)
+
+# Write the combined data to a CSV file without row names
+write.csv(tune, file = paste0(args$outprefix, ".tuning.csv"), row.names = FALSE)
+# Write kg_consensus to a bed file for testing
+export.bed(kg_consensus, con = paste0(args$outprefix, ".tuning.consensus.bed"))
+
+########################
+## CITE PACKAGES USED ##
+########################
+citation("groHMM")
+citation("GenomicFeatures")
+citation("GenomicAlignments")
+citation("AnnotationDbi")
+
+####################
+## R SESSION INFO ##
+####################
+r_log_file <- "R_sessionInfo.log"
+if (file.exists(r_log_file) == FALSE) {
+  sink(r_log_file)
+  a <- sessionInfo()
+  print(a)
+  sink()
+}