Merge branch 'devel' into main

kharchenkolab · Jun 5, 2023 · 579b2f9
2 parents a367fa5 + 6989a3e
commit 579b2f9
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 18 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: numbat
 Title: Haplotype-Aware CNV Analysis from scRNA-Seq
 URL: https://github.com/kharchenkolab/numbat/, https://kharchenkolab.github.io/numbat/
-Version: 1.3.1
+Version: 1.3.2
 Authors@R: c(person("Teng","Gao", email="[email protected]", role=c("cre", "aut")), person("Ruslan", "Soldatov", email="[email protected]", role="aut"), person("Hirak", "Sarkar", email="[email protected]", role="aut"), person("Evan", "Biederstedt", email="[email protected]", role="aut"), person("Peter", "Kharchenko", email = "[email protected]", role = "aut"))
 Description: A computational method that infers copy number variations (CNVs) in cancer scRNA-seq data and reconstructs the tumor phylogeny. 'numbat' integrates signals from gene expression, allelic ratio, and population haplotype structures to accurately infer allele-specific CNVs in single cells and reconstruct their lineage relationship. 'numbat' can be used to: 1. detect allele-specific copy number variations from single-cells; 2. differentiate tumor versus normal cells in the tumor microenvironment; 3. infer the clonal architecture and evolutionary history of profiled tumors. 'numbat' does not require tumor/normal-paired DNA or genotype data, but operates solely on the donor scRNA-data data (for example, 10x Cell Ranger output). Additional examples and documentations are available at <https://kharchenkolab.github.io/numbat/>. For details on the method please see Gao et al. Nature Biotechnology (2022) <doi:10.1038/s41587-022-01468-y>.
 License: MIT + file LICENSE

diff --git a/R/diagnostics.R b/R/diagnostics.R
@@ -80,8 +80,15 @@ check_allele_df = function(df) {
         log_error(msg)
         stop(msg)
     }
+
+    # check chrom prefix 
+    if (any(str_detect(df$CHROM[1], '^chr'))) {
+        df = df %>% mutate(CHROM = str_remove(CHROM, 'chr'))
+    } 
 
-    df = df %>% mutate(CHROM = factor(CHROM, 1:22))
+    df = df %>% 
+        filter(CHROM %in% 1:22) %>%
+        mutate(CHROM = factor(CHROM, 1:22))
 
     return(df)
 
@@ -143,11 +150,25 @@ check_exp_ref = function(lambdas_ref) {
     if (!is.matrix(lambdas_ref)) {
         lambdas_ref = as.matrix(lambdas_ref) %>% magrittr::set_colnames('ref')
     }
+
+    # check if all entries in the reference profile are integers
+    if (all(lambdas_ref == as.integer(lambdas_ref))) {
+        msg = "The reference expression matrix 'lambdas_ref' should be normalized gene expression magnitudes. Please use aggregate_counts() function to prepare the reference profile from raw counts."
+        log_error(msg)
+        stop(msg)
+    } else if (any(duplicated(rownames(lambdas_ref)))) {
+        msg = "Please remove duplicated genes in reference profile"
+        log_error(msg)
+        stop(msg)
+    }
+
 
     return(lambdas_ref)
 
 }
 
+
+
 #' check inter-individual contamination
 #' @param bulk dataframe Pseudobulk profile
 #' @return NULL

diff --git a/R/main.R b/R/main.R
@@ -836,9 +836,8 @@ run_group_hmms = function(
     bad = sapply(results, inherits, what = "try-error")
 
     if (any(bad)) {
-        log_error(glue('job {paste(which(bad), collapse = ",")} failed'))
         log_error(results[bad][[1]])
-        message(results[bad][[1]])
+        stop(results[bad][[1]])
     }
 
     bulks = results %>% bind_rows() %>%

diff --git a/R/utils.R b/R/utils.R
@@ -924,17 +924,24 @@ phi_hat_roll = function(Y_obs, lambda_ref, d_obs, mu, sig, h) {
 #' @return vector of alphabetical postfixes
 #' @keywords internal
 generate_postfix <- function(n) {
-  alphabet <- letters
-  postfixes <- sapply(n, function(i) {
-    postfix <- character(0)
-    while (i > 0) {
-      remainder <- (i - 1) %% 26
-      i <- (i - 1) %/% 26
-      postfix <- c(alphabet[remainder + 1], postfix)
+
+    if (any(is.na(n))) {
+        stop("Segment number cannot contain NA")
     }
-    paste(postfix, collapse = "")
-  })
-  return(postfixes)
+
+    alphabet <- letters
+
+    postfixes <- sapply(n, function(i) {
+        postfix <- character(0)
+        while (i > 0) {
+            remainder <- (i - 1) %% 26
+            i <- (i - 1) %/% 26
+            postfix <- c(alphabet[remainder + 1], postfix)
+        }
+        paste(postfix, collapse = "")
+    })
+
+    return(postfixes)
 }
 
 #' Annotate copy number segments after HMM decoding 
@@ -997,7 +1004,8 @@ annot_haplo_segs = function(bulk) {
 #' @return dataframe Pseudobulk profile
 #' @keywords internal
 smooth_segs = function(bulk, min_genes = 10) {
-    bulk %>% group_by(seg) %>%
+
+    bulk = bulk %>% group_by(seg) %>%
         mutate(
             cnv_state = ifelse(n_genes <= min_genes, NA, cnv_state)
         ) %>%
@@ -1006,6 +1014,17 @@ smooth_segs = function(bulk, min_genes = 10) {
         mutate(cnv_state = zoo::na.locf(cnv_state, fromLast = FALSE, na.rm=FALSE)) %>%
         mutate(cnv_state = zoo::na.locf(cnv_state, fromLast = TRUE, na.rm=FALSE)) %>%
         ungroup()
+
+    chrom_na = bulk %>% group_by(CHROM) %>% summarise(all_na = all(is.na(cnv_state)))
+
+    if (any(chrom_na$all_na)) {
+        chroms_na = paste0(chrom_na$CHROM[chrom_na$all_na], collapse = ',')
+        msg = glue("No segments containing more than {min_genes} genes for CHROM {chroms_na}.")
+        log_error(msg)
+        stop(msg)
+    }
+
+    return(bulk)
 }
 
 #' Annotate a consensus segments on a pseudobulk dataframe
@@ -1101,7 +1120,7 @@ find_common_diploid = function(
     }
 
     # define balanced regions in each sample
-    bulks = mclapply(
+    results = mclapply(
         bulks %>% split(.$sample),
         mc.cores = ncores,
         function(bulk) {
@@ -1123,8 +1142,16 @@ find_common_diploid = function(
                 smooth_segs(min_genes = min_genes) %>%
                 annot_segs(var = 'cnv_state')
 
-        }) %>%
-        bind_rows()
+        })
+
+    bad = sapply(results, inherits, what = "try-error")
+
+    if (any(bad)) {
+        log_error(results[bad][[1]])
+        stop(results[bad][[1]])
+    } else {
+        bulks = results %>% bind_rows()
+    }
 
     # always exclude clonal LOH regions if any
     if (any(bulks$loh)) {

diff --git a/man/likelihood_allele.Rd b/man/likelihood_allele.Rd