Skip to content

Commit

Permalink
Merge pull request #165 from nf-core/grohmm-overhaul
Browse files Browse the repository at this point in the history
  • Loading branch information
edmundmiller authored Oct 23, 2024
2 parents af25b3f + 9c65113 commit bb65227
Show file tree
Hide file tree
Showing 36 changed files with 1,249 additions and 517 deletions.
6 changes: 5 additions & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ trim_trailing_whitespace = true
indent_size = 4
indent_style = space

[*.{md,yml,yaml,html,css,scss,js}]
[*.{md,yml,yaml,html,css,scss,js,R,Rmd}]
indent_size = 2

# These files are edited and tested upstream in nf-core/modules
Expand All @@ -31,3 +31,7 @@ indent_size = unset
# ignore python and markdown
[*.{py,md}]
indent_style = unset

# Follow tidyverse style for R
[*.{R,Rmd}]
indent_size = 2
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#137](https://github.com/nf-core/nascent/pull/137) - Use singularity containers for PINTS
- [#142](https://github.com/nf-core/nascent/pull/142) - Updated CHM13 references
- [#171](https://github.com/nf-core/nascent/pull/171) - Use assertAll in tests
- [#165](https://github.com/nf-core/nascent/pull/165) - groHMM overhaul. Removed R mclapply calls and replaced with Nextflow scatter gather for parameter tuning. This creates a job for each parameter set.

### Fixed

- [#170](https://github.com/nf-core/nascent/pull/170) - Remove "Access to undefined parameter forwardStranded" warnings

### Removed

- [#165](https://github.com/nf-core/nascent/pull/165) - Removed support for groHMM tuning files.

## v2.2.0 - 2024-03-05

### Added
Expand Down
2 changes: 1 addition & 1 deletion assets/multiqc_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ custom_data:
plot_type: "image"
sp:
grohmm_plot:
fn: "*.tdplot_mqc.jpg"
fn: "*.tdplot_mqc.png"
ignore_images: false

export_plots: true
Expand Down
170 changes: 170 additions & 0 deletions bin/grohmm_parametertuning.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#!/usr/bin/env Rscript

suppressPackageStartupMessages(library(argparse))
suppressPackageStartupMessages(library(GenomicFeatures))
suppressPackageStartupMessages(library(GenomicAlignments))
suppressPackageStartupMessages(library(groHMM))

# Command-line interface for a single groHMM tuning run.
#
# Help strings must be character values. The two multi-line descriptions are
# assembled with paste(): cat() would print them to stdout while the script
# starts and hand NULL to add_argument(), leaving the options undocumented.
parser <- ArgumentParser(description = "Run groHMM on some bam files")

# One or more input alignment files (required).
parser$add_argument(
  "-i",
  "--bam_files",
  type = "character",
  nargs = "+",
  metavar = "path",
  help = "GRO SEQ data in bam files.",
  required = TRUE
)
# Directory the script switches into before writing its outputs.
parser$add_argument(
  "-o",
  "--outdir",
  type = "character",
  default = "./",
  metavar = "path",
  help = "Output directory."
)
# Tuning parameter forwarded to detectTranscripts() as LtProbB.
parser$add_argument(
  "-l",
  "--ltprobb",
  type = "integer",
  default = -200,
  metavar = "integer",
  help = paste(
    "Log-transformed transition probability of switching from transcribed",
    "state to non-transcribed state"
  )
)
# Tuning parameter forwarded to detectTranscripts() as UTS.
parser$add_argument(
  "-u",
  "--uts",
  type = "integer",
  default = 5,
  metavar = "integer",
  help = paste(
    "Variance of the emission probability for reads in the",
    "non-transcribed state, respectively."
  )
)
# Prefix prepended to the names of the files written by this script.
parser$add_argument(
  "-p",
  "--outprefix",
  type = "character",
  default = "grohmm",
  metavar = "string",
  help = "Output prefix."
)
# Annotation file used to build the TxDb (required).
parser$add_argument(
  "-g",
  "--gxf",
  type = "character",
  default = NULL,
  metavar = "string",
  help = "GFF/GTF File to create TxDb",
  required = TRUE
)
parser$add_argument(
  "-c",
  "--cores",
  type = "integer",
  default = 1,
  metavar = "integer",
  help = "Number of cores."
)
# No default: args$memory is NULL when the flag is omitted.
parser$add_argument(
  "-m",
  "--memory",
  type = "integer",
  metavar = "integer",
  help = "Amount of memory in MB"
)

# Parse the command line and apply the process-wide settings.
args <- parser$parse_args()

options(mc.cores = getCores(args$cores))
# NOTE(review): memory.limit() is documented as Windows-specific and appears
# to be an unsupported stub in recent R releases - confirm whether this call
# still has any effect before relying on --memory.
memory.limit(size = args$memory)
# All outputs below are written relative to the output directory.
setwd(args$outdir)

# Defensive guard: --bam_files is declared required, so parse_args() is
# expected to fail first. Help is printed through the parser object, because
# argparse has no standalone print_help() function (that is the optparse API).
if (is.null(args$bam_files)) {
  parser$print_help()
  stop("Please provide a bam file", call. = FALSE)
}

# Read every BAM file, coerce its alignments to GRanges and accumulate them
# into a single object, pruning to standard chromosomes after each addition.
# TODO? CHANGE BASED ON PAIRED OR SINGLE END
alignments <- c()
for (bam_path in args$bam_files) {
  bam_ranges <- as(readGAlignments(bam_path), "GRanges")
  alignments <- keepStandardChromosomes(
    append(alignments, bam_ranges),
    pruning.mode = "coarse"
  )
}

# Build the transcript database from the annotation file given via --gxf,
# pull out the transcripts, and collapse them into consensus annotations.
print("Input transcript annotations")
kg_db <- makeTxDbFromGFF(args$gxf)
tx_columns <- c("gene_id", "tx_id", "tx_name")
kg_tx <- transcripts(kg_db, columns = tx_columns)

print("Collapse annotations in preparation for overlap")
kg_consensus <- makeConsensusAnnotations(kg_tx, mc.cores = args$cores)
print("Finished consensus annotations")

############
## TUNING ##
############
# Run the HMM once for the parameter pair given on the command line and
# evaluate the called transcripts against the consensus annotations.
print("Starting tuning run")

# Record the parameters of this run; evaluation metrics are appended later.
tune <- data.frame(
  LtProbB = args$ltprobb,
  UTS = args$uts
)

# Per-strand window counts, computed once with 50 bp windows.
fp <- windowAnalysis(alignments, strand = "+", windowSize = 50)
fm <- windowAnalysis(alignments, strand = "-", windowSize = 50)

# R is case-sensitive: the window objects are `fp`/`fm`, so they must be
# passed by those names (referencing `Fp`/`Fm` fails with "object not found").
hmm <- detectTranscripts(
  Fp = fp,
  Fm = fm,
  reads = alignments,
  LtProbB = args$ltprobb,
  UTS = args$uts
)

print("Evaluating")
e <- evaluateHMMInAnnotations(hmm$transcripts, kg_consensus)

# Turn the evaluation result into a data frame of metrics
eval_metrics <- as.data.frame(e$eval)

# When the first column is itself a list, flatten each entry of e$eval and
# rebuild the data frame from the flattened values
if (is.list(eval_metrics[[1]])) {
  eval_metrics <- as.data.frame(t(sapply(e$eval, unlist)))
}

# Append the metrics to the row holding this run's tuning parameters
tune <- cbind(tune, eval_metrics)

# Echo the evaluation to the log
print(e$eval)
print(e)

# Tuning results go to <outprefix>.tuning.csv, without row names
tuning_csv <- paste0(args$outprefix, ".tuning.csv")
write.csv(tune, file = tuning_csv, row.names = FALSE)

# Consensus annotations go to <outprefix>.tuning.consensus.bed for testing
consensus_bed <- paste0(args$outprefix, ".tuning.consensus.bed")
export.bed(kg_consensus, con = consensus_bed)

########################
## CITE PACKAGES USED ##
########################
# Print the citation entry of each package, in the same order as before.
cited_packages <- c(
  "groHMM",
  "GenomicFeatures",
  "GenomicAlignments",
  "AnnotationDbi"
)
for (pkg in cited_packages) {
  print(citation(pkg))
}

####################
## R SESSION INFO ##
####################
# Write the session information to a log file, unless one already exists.
r_log_file <- "R_sessionInfo.log"
if (!file.exists(r_log_file)) {
  sink(r_log_file)
  print(sessionInfo())
  sink()
}
Loading

0 comments on commit bb65227

Please sign in to comment.