diff --git a/DESCRIPTION b/DESCRIPTION index c011fdea..d15ea63f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: numbat -Title: Accurate reconstruction of copy number profiles in scRNAseq data using population haplotype phasing +Title: Haplotype-aware CNV anlaysis from scRNA-seq URL: https://github.com/kharchenkolab/numbat, https://kharchenkolab.github.io/numbat -Version: 0.1.3 +Version: 1.0.0 Authors@R: c(person("Teng","Gao", email="tgao@g.harvard.edu", role=c("cre", "aut")), person("Hirak", "Sarkar", email="hirak_sarkar@hms.harvard.edu", role="aut"), person("Evan", "Biederstedt", email="evan.biederstedt@gmail.com", role="aut"), person("Peter", "Kharchenko", email = "peter_kharchenko@hms.harvard.edu", role = "aut")) Description: Numbat is a computational method that infers copy number variations (CNVs) in cancer scRNA-Seq data and reconstructs the tumor phylogeny. The R package integrates signals from gene expression, allelic ratio, and population haplotype structures to accurately infer allele-specific CNVs in single cells and reconstruct their lineage relationship. Numbat can be used to: 1. detect allele-specific copy number variations from single-cells; 2. differentiate tumor versus normal cells in the tumor microenvironment; 3. infer the clonal architecture and evolutionary history of profiled tumors. Numbat does not require tumor/normal-paired DNA or genotype data, but operates solely on the donor scRNA-data data (for example, 10x Cell Ranger output). License: MIT diff --git a/R/main.R b/R/main.R index 33c0470e..33618062 100644 --- a/R/main.R +++ b/R/main.R @@ -235,7 +235,7 @@ run_numbat = function( p = plot_bulks(bulk_subtrees, min_LLR = min_LLR, use_pos = TRUE) ggsave( glue('{out_dir}/bulk_subtrees_{i}.png'), p, - width = 12, height = 2*length(unique(bulk_subtrees$sample)), dpi = 200 + width = 12, height = 2*length(unique(bulk_subtrees$sample)), dpi = 250 ) } @@ -253,7 +253,6 @@ run_numbat = function( bulk_subtrees = retest_bulks( bulk_subtrees, segs_consensus, - # segs_loh = segs_loh, diploid_chroms = diploid_chroms, min_LLR = min_LLR, ncores = ncores @@ -304,7 +303,7 @@ run_numbat = function( p = plot_bulks(bulk_clones, min_LLR = min_LLR, use_pos = TRUE) ggsave( glue('{out_dir}/bulk_clones_{i}.png'), p, - width = 12, height = 2*length(unique(bulk_clones$sample)), dpi = 200 + width = 12, height = 2*length(unique(bulk_clones$sample)), dpi = 250 ) } @@ -474,7 +473,7 @@ run_numbat = function( clone_bar = TRUE ) - ggsave(glue('{out_dir}/panel_{i}.png'), panel, width = 8, height = 3.5, dpi = 200) + ggsave(glue('{out_dir}/panel_{i}.png'), panel, width = 8, height = 3.5, dpi = 250) }, error = function(e) { @@ -518,7 +517,7 @@ run_numbat = function( } } - # Output final subclone bulk profiles + # Output final subclone bulk profiles - logic can be simplified bulk_clones = make_group_bulks( groups = clones, count_mat = count_mat, @@ -535,7 +534,7 @@ run_numbat = function( gamma = gamma, alpha = alpha, min_genes = min_genes, - common_diploid = common_diploid, + common_diploid = FALSE, diploid_chroms = diploid_chroms, segs_loh = segs_loh, ncores = ncores, @@ -556,7 +555,7 @@ run_numbat = function( p = plot_bulks(bulk_clones, min_LLR = min_LLR, use_pos = TRUE) ggsave( glue('{out_dir}/bulk_clones_final.png'), p, - width = 12, height = 2*length(unique(bulk_clones$sample)), dpi = 200 + width = 12, height = 2*length(unique(bulk_clones$sample)), dpi = 250 ) } @@ -763,7 +762,7 @@ run_group_hmms = function( #' @param min_overlap numeric Minimum overlap fraction to determine count two events as as overlapping #' @return dataframe Consensus segments #' @keywords internal -get_segs_consensus = function(bulks, min_LLR = 5, min_overlap = 0.45, retest = FALSE) { +get_segs_consensus = function(bulks, min_LLR = 5, min_overlap = 0.45, retest = TRUE) { if (!'sample' %in% colnames(bulks)) { bulks$sample = 1 @@ -1009,6 +1008,10 @@ get_clone_post = function(gtree, exp_post, allele_post) { #' @return dataframe Consensus CNV segments #' @keywords internal resolve_cnvs = function(segs_all, min_overlap = 0.5, debug = FALSE) { + + if (nrow(segs_all) == 0) { + return(segs_all) + } V = segs_all %>% ungroup() %>% mutate(vertex = 1:n(), .before = 1) diff --git a/R/vis.R b/R/vis.R index e89e801a..493e3e17 100644 --- a/R/vis.R +++ b/R/vis.R @@ -58,7 +58,7 @@ cnv_labels = names(cnv_colors) %>% plot_psbulk = function( bulk, use_pos = TRUE, allele_only = FALSE, min_LLR = 5, min_depth = 8, exp_limit = 2, phi_mle = TRUE, theta_roll = FALSE, dot_size = 0.8, dot_alpha = 0.5, legend = FALSE, - exclude_gap = TRUE + exclude_gap = TRUE, raster = FALSE ) { if (!all(c('state_post', 'cnv_state_post') %in% colnames(bulk))) { @@ -120,24 +120,32 @@ plot_psbulk = function( D, aes(x = get(marker), y = value, color = state_post), na.rm=TRUE - ) + - geom_point( - aes( - shape = str_detect(state_post, '_2'), - alpha = str_detect(state_post, '_2') - ), + ) + + if (use_pos & exclude_gap) { + + segs_exclude = gaps_hg38 %>% + filter(end - start > 1e+06) %>% + rbind(acen_hg38) %>% + rename(seg_start = start, seg_end = end) %>% + filter(CHROM %in% bulk$CHROM) + + if (nrow(segs_exclude) > 0) { + p = p + geom_rect(inherit.aes = F, data = segs_exclude, + aes(xmin = seg_start, xmax = seg_end, ymin = -Inf, ymax = Inf), + fill = "gray95") + } + } + + p = p + geom_point( + aes(shape = str_detect(state_post, '_2'), alpha = str_detect(state_post, '_2')), size = dot_size, na.rm = TRUE ) + geom_hline( data = data.frame(y = c(0,1), variable = 'pHF'), aes(yintercept = y), - size = 0 - ) + - geom_hline( - data = data.frame(y = c(-exp_limit, exp_limit), variable = 'logFC'), - aes(yintercept = y), - size = 0 + size = 0, alpha = 0 ) + scale_alpha_discrete(range = c(dot_alpha, 1)) + scale_shape_manual(values = c(`FALSE` = 16, `TRUE` = 15)) + @@ -147,7 +155,8 @@ plot_psbulk = function( panel.spacing.y = unit(1, 'mm'), panel.border = element_rect(size = 0.5, color = 'gray', fill = NA), strip.background = element_blank(), - axis.text.x = element_blank() + axis.text.x = element_blank(), + axis.ticks.x = element_blank() ) + facet_grid(variable ~ CHROM, scale = 'free', space = 'free_x') + # scale_x_continuous(expand = expansion(add = 5)) + @@ -161,19 +170,18 @@ plot_psbulk = function( xlab(marker) + ylab('') - if (!legend) { - p = p + guides(color = 'none', fill = 'none', alpha = 'none', shape = 'none') + if (!allele_only) { + p = p + geom_hline( + data = data.frame(y = c(-exp_limit, exp_limit), variable = 'logFC'), + aes(yintercept = y), + size = 0, alpha = 0) } - if (use_pos & exclude_gap) { - segs_exclude = gaps_hg38 %>% filter(end - start > 1e+06) %>% - rename(seg_start = start, seg_end = end) - p = p + geom_rect(inherit.aes = F, data = segs_exclude, - aes(xmin = seg_start, xmax = seg_end, ymin = -Inf, - ymax = Inf), fill = "gray95") + if (!legend) { + p = p + guides(color = 'none', fill = 'none', alpha = 'none', shape = 'none') } - if (phi_mle) { + if (phi_mle & (!allele_only)) { segs = bulk %>% distinct(CHROM, seg, seg_start, seg_start_index, seg_end, seg_end_index, phi_mle) %>% mutate(variable = 'logFC') %>% @@ -225,6 +233,10 @@ plot_psbulk = function( } p = p + xlab(marker_label) + + if (raster) { + p = ggrastr::rasterize(p, layers = 'Point', dpi = 300) + } return(p) } @@ -427,7 +439,7 @@ plot_phylo_heatmap = function( annot = NULL, pal_annot = NULL, annot_title = 'Annotation', annot_scale = NULL, clone_dict = NULL, clone_bar = FALSE, pal_clone = NULL, clone_title = 'Genotype', clone_legend = TRUE, p_min = 0.9, line_width = 0.1, tree_height = 1, branch_width = 0.2, tip_length = 0.2, - clone_line = FALSE, superclone = FALSE, exclude_gap = FALSE, root_edge = TRUE, snvs = NULL + clone_line = FALSE, superclone = FALSE, exclude_gap = FALSE, root_edge = TRUE, snvs = NULL, raster = FALSE ) { # make sure chromosomes are in order @@ -573,9 +585,11 @@ plot_phylo_heatmap = function( if (exclude_gap) { - segs_exclude = gaps_hg38 %>% filter(end - start > 1e6) %>% - mutate(CHROM = as.integer(as.character(CHROM))) %>% - rename(seg_start = start, seg_end = end) + segs_exclude = gaps_hg38 %>% + filter(end - start > 1e+06) %>% + # bind_rows(acen_hg38) %>% + rename(seg_start = start, seg_end = end) %>% + mutate(CHROM = as.integer(CHROM)) p_segs = p_segs + geom_rect( @@ -622,7 +636,7 @@ plot_phylo_heatmap = function( ) %>% mutate(cell = factor(cell, cell_order)) %>% filter(!is.na(cell)) %>% - annot_bar(transpose = TRUE, legend = clone_legend, pal_annot = pal_clone, legend_title = clone_title, size = size) + annot_bar(transpose = TRUE, legend = clone_legend, pal_annot = pal_clone, legend_title = clone_title, size = size, raster = raster) # add clone lines if (clone_line) { @@ -651,6 +665,10 @@ plot_phylo_heatmap = function( p_segs = p_segs + geom_hline(yintercept = clone_indices, color = 'darkslategray', size = 0.5, linetype = 'dashed') } + if (raster) { + p_segs = ggrastr::rasterize(p_segs, layers = 'Segment', dpi = 300) + } + # external annotation if (!is.null(annot)) { @@ -660,7 +678,7 @@ plot_phylo_heatmap = function( ) %>% filter(cell %in% cell_order) %>% mutate(cell = factor(cell, cell_order)) %>% - annot_bar(transpose = T, pal_annot = pal_annot, legend_title = annot_title, size = size, annot_scale = annot_scale) + annot_bar(transpose = T, pal_annot = pal_annot, legend_title = annot_title, size = size, annot_scale = annot_scale, raster = raster) if (clone_bar) { (p_tree | p_clone | p_annot | p_segs) + plot_layout(widths = c(tree_height, 0.25, 0.25, 15), guides = 'collect') @@ -872,6 +890,95 @@ plot_sc_tree = function(gtree, label_mut = TRUE, label_size = 3, dot_size = 2, b } +#' Plot CNV heatmap +#' @param segs dataframe Segments to plot. Need columns "seg_start", "seg_end", "cnv_state" +#' @param var character Column to facet by +#' @param label_group logical Label the groups +#' @param legend logical Display the legend +#' @export +cnv_heatmap = function(segs, var = 'group', label_group = TRUE, legend = TRUE, exclude_gap = TRUE) { + + if (!'p_cnv' %in% colnames(segs)) { + segs$p_cnv = 1 + } + + p = ggplot( + segs + ) + + geom_rect( + aes(xmin = seg_start, xmax = seg_end, ymin = -0.5, ymax = 0.5, fill = cnv_state, alpha = p_cnv) + ) + + geom_rect( + data = chrom_sizes_hg38, + aes(xmin = 0, xmax = size, ymin = -0.5, ymax = 0.5, fill = NA) + ) + + if (exclude_gap) { + + segs_exclude = gaps_hg38 %>% + filter(end - start > 1e+06) %>% + rename(seg_start = start, seg_end = end) + + p = p + + geom_rect( + inherit.aes = F, + data = segs_exclude, + aes(xmin = seg_start, + xmax = seg_end, + ymin = -Inf, + ymax = Inf), + fill = 'white' + ) + + geom_rect( + inherit.aes = F, + data = segs_exclude, + aes(xmin = seg_start, + xmax = seg_end, + ymin = -Inf, + ymax = Inf), + fill = 'gray', + alpha = 0.5 + ) + } + + p = p + + theme_classic() + + theme( + panel.spacing = unit(0, 'mm'), + panel.border = element_rect(size = 0.5, color = 'gray', fill = NA), + strip.background = element_blank(), + strip.text.y = element_text(angle = 0), + axis.text = element_blank(), + plot.margin = margin(0, 0, 0, 0), + axis.title.x = element_blank(), + axis.ticks = element_blank(), + axis.line.x = element_blank() + ) + + scale_fill_manual( + values = c('neu' = 'white', cnv_colors[names(cnv_colors) != 'neu']), + na.value = 'white', + limits = force, + labels = cnv_labels, + na.translate = F, + name = 'States' + ) + + scale_alpha_continuous(range = c(0,1), limits = c(0.5,1), oob = scales::squish) + + guides(alpha = 'none') + + scale_x_continuous(expand = expansion(add = 0)) + + facet_grid(get(var)~CHROM, space = 'free_x', scale = 'free', drop = TRUE) + + if (!legend) { + p = p + theme(legend.position = 'none') + } + + if (!label_group) { + p = p + theme(strip.text.y = element_blank()) + } + + return(p) + +} + ########################### Functions for internal use ############################ @@ -1062,14 +1169,14 @@ plot_markers = function(sample, count_mat, cell_annot, markers, clone_post, pal_ } -do_plot = function(p, f, w, h, out_dir = '~/figures') { - ggsave(filename = paste0(out_dir, '/', f, '.png'), plot = p, width = w, height = h, device = 'png', dpi = 300) +do_plot = function(p, f, w, h, out_dir = '~/figures', device = 'pdf') { + ggsave(filename = paste0(out_dir, '/', f, '.', device), plot = p, width = w, height = h, device = device, dpi = 300) options(repr.plot.width = w, repr.plot.height = h, repr.plot.res = 300) print(p) } # expect columns cell and annot -annot_bar = function(D, transpose = FALSE, legend = TRUE, legend_title = '', size = 0.05, pal_annot = NULL, annot_scale = NULL) { +annot_bar = function(D, transpose = FALSE, legend = TRUE, legend_title = '', size = 0.05, pal_annot = NULL, annot_scale = NULL, raster = FALSE) { D = D %>% mutate(cell_index = as.integer(cell)) @@ -1118,6 +1225,10 @@ annot_bar = function(D, transpose = FALSE, legend = TRUE, legend_title = '', siz p = p + guides(fill = 'none') } + if (raster) { + p = ggrastr::rasterize(p, layers = 'Tile', dpi = 300) + } + return(p) } @@ -1228,80 +1339,6 @@ plot_exp_post = function(exp_post, jitter = TRUE) { return(p) } -cnv_heatmap = function(segs, var = 'group', label_group = TRUE, legend = TRUE) { - - gaps_hg38_filtered = gaps_hg38 %>% filter(end - start > 1e6) - - if (!'p_cnv' %in% colnames(segs)) { - segs$p_cnv = 1 - } - - p = ggplot( - segs - ) + - geom_rect( - aes(xmin = seg_start, xmax = seg_end, ymin = -0.5, ymax = 0.5, fill = cnv_state, alpha = p_cnv) - ) + - geom_rect( - data = chrom_sizes_hg38, - aes(xmin = 0, xmax = size, ymin = -0.5, ymax = 0.5, fill = NA) - ) + - geom_rect( - inherit.aes = F, - data = gaps_hg38_filtered, - aes(xmin = start, - xmax = end, - ymin = -Inf, - ymax = Inf), - fill = 'white' - ) + - geom_rect( - inherit.aes = F, - data = gaps_hg38_filtered, - aes(xmin = start, - xmax = end, - ymin = -Inf, - ymax = Inf), - fill = 'gray', - alpha = 0.5 - ) + - theme_classic() + - theme( - panel.spacing = unit(0, 'mm'), - panel.border = element_rect(size = 0.5, color = 'gray', fill = NA), - strip.background = element_blank(), - strip.text.y = element_text(angle = 0), - axis.text = element_blank(), - plot.margin = margin(0, 0, 0, 0), - axis.title.x = element_blank(), - axis.ticks = element_blank(), - axis.line.x = element_blank() - ) + - scale_fill_manual( - values = c('neu' = 'white', cnv_colors[names(cnv_colors) != 'neu']), - na.value = 'white', - limits = force, - labels = cnv_labels, - na.translate = F, - name = 'States' - ) + - scale_alpha_continuous(range = c(0,1), limits = c(0.5,1), oob = scales::squish) + - guides(alpha = 'none') + - scale_x_continuous(expand = expansion(add = 0)) + - facet_grid(get(var)~CHROM, space = 'free_x', scale = 'free', drop = TRUE) - - if (!legend) { - p = p + theme(legend.position = 'none') - } - - if (!label_group) { - p = p + theme(strip.text.y = element_blank()) - } - - return(p) - -} - plot_clone_profile = function(joint_post, clone_post) { clone_profile = get_clone_profile(joint_post, clone_post) diff --git a/data/acen_hg38.rda b/data/acen_hg38.rda new file mode 100644 index 00000000..468e7337 Binary files /dev/null and b/data/acen_hg38.rda differ diff --git a/pileup_and_phase.R b/pileup_and_phase.R index 7e815163..cf51e35b 100644 --- a/pileup_and_phase.R +++ b/pileup_and_phase.R @@ -149,7 +149,7 @@ vcfs = lapply(samples, function(sample){vcfR::read.vcfR(glue('{outdir}/pileup/{s numbat:::genotype(label, samples, vcfs, glue('{outdir}/phasing')) ## phasing -eagle_cmd = function(chr, sample) { +eagle_cmd = function(chr) { paste(eagle, glue('--numThreads {ncores}'), glue('--vcfTarget {outdir}/phasing/{label}_chr{chr}.vcf.gz'), @@ -159,11 +159,7 @@ eagle_cmd = function(chr, sample) { sep = ' ') } -cmds = c() - -for (sample in samples) { - cmds = c(cmds, lapply(1:22, function(chr){eagle_cmd(chr, sample)})) -} +cmds = lapply(1:22, function(chr){eagle_cmd(chr)}) script = glue('{outdir}/run_phasing.sh') diff --git a/raw-data/prep_data.R b/raw-data/prep_data.R index fdb5a490..51d20765 100644 --- a/raw-data/prep_data.R +++ b/raw-data/prep_data.R @@ -71,6 +71,7 @@ chrom_sizes_hg38 = fread('~/ref/hg38.chrom.sizes.txt') %>% mutate(CHROM = factor(as.integer(CHROM))) ## gaps_hg38.rda ## +# https://github.com/hartwigmedical/hmftools/blob/master/purple/src/main/resources/circos/gaps.38.txt gaps_hg38 = fread('~/ref/gaps.38.txt') %>% setNames(c('CHROM', 'start', 'end')) %>% mutate(CHROM = str_remove(CHROM, 'hs')) %>% @@ -78,6 +79,17 @@ gaps_hg38 = fread('~/ref/gaps.38.txt') %>% mutate(CHROM = as.factor(as.integer(CHROM))) %>% arrange(CHROM) +## acen_hg38.rda ## +acen_hg38 = fread('~/ref/chromosome.band.hg38.txt') %>% + rename(CHROM = `#chrom`) %>% + filter(gieStain == 'acen') %>% + mutate(start = chromStart, end = chromEnd) %>% + mutate(CHROM = str_remove(CHROM, 'chr')) %>% + filter(CHROM %in% 1:22) %>% + mutate(CHROM = factor(CHROM, 1:22)) %>% + group_by(CHROM) %>% + summarise(start = min(start), end = max(end)) + ## vcf_meta.rda ## # wget ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/AshkenazimTrio/HG002_NA24385_son/latest/GRCh38/HG002_GRCh38_GIAB_highconf_CG-Illfb-IllsentieonHC-Ion-10XsentieonHC-SOLIDgatkHC_CHROM1-22_v.3.3.2_highconf_triophased.vcf.gz # https://github.com/picrin/pysccnv/blob/master/Eagle2_benchmark.ipynb