Skip to content

Commit

Permalink
count ambiguous txs
Browse files Browse the repository at this point in the history
  • Loading branch information
mfansler committed Aug 30, 2021
1 parent f78a2d5 commit 211a229
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 0 deletions.
12 changes: 12 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -251,3 +251,15 @@ rule mtxs_to_sce_genes:
mem_mb=16000
script:
"scripts/mtxs_to_sce_genes.R"

rule count_ambiguities:
input:
ec="data/kallisto/{target}/{sample_id}/matrix.ec",
txs="data/kallisto/{target}/{sample_id}/transcripts.txt",
merge="data/utrs/{target}/{level}_merge.tsv"
output:
tsv="qc/kallisto/multimapping/{target}/{sample_id}/{level}_multimapping.tsv.gz"
wildcard_constraints:
level="(gene|tx)"
script:
"scripts/count_ambiguities.R"
68 changes: 68 additions & 0 deletions scripts/count_ambiguities.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env Rscript

library(magrittr)
library(tidyverse)

################################################################################
# Fake Data (Interactive Testing)
################################################################################

if (interactive()) {
Snakemake <- setClass("Snakemake", slots=c(input='list', output='list', params='list'))
snakemake <- Snakemake(
input=list(
ec=c("data/kallisto/utrome_mm10_v1/heart_1k_v2_fastq/matrix.ec"),
txs=c("data/kallisto/utrome_mm10_v1/heart_1k_v2_fastq/transcripts.txt"),
merge=c("data/utrs/utrome_mm10_v1/gene_merge.tsv")),
output=list(tsv="/fscratch/fanslerm/test.ambiguity.tsv"),
params=list())
}

################################################################################
# Load Data
################################################################################

tx_map <- read_lines(snakemake@input$txs) %>%
set_names(seq(0, along.with=.))

df_merge <- read_tsv(snakemake@input$merge,
col_names=c("tx_name", "merge_name"),
col_types="cc")

df_ec <- read_tsv(snakemake@input$ec,
col_names=c("ec", "tx"),
col_types="ic") %>%

## expand equivalence classes
separate_rows(tx, sep=",", convert=FALSE) %>%

## map txs to tx_name
mutate(tx_name=tx_map[tx]) %>%

## including merging table
left_join(df_merge, by="tx_name") %>%

## does equivalence class include non-merging transcripts?
group_by(ec) %>%
mutate(is_multimapping=!all(first(merge_name) == merge_name)) %>%
ungroup() %>%

## if so, call such txs "multimapping" and include num of ecs
group_by(tx) %>%
mutate(is_multimapping=any(is_multimapping),
n_ecs=n()) %>%

## does the transcript have ambiguity w/i the merging set?
group_by(tx, merge_name) %>%
mutate(is_overlapping=n() > 1) %>%
ungroup() %>%

## retain tx-level summary
select(merge_name, tx_name, n_ecs, is_multimapping, is_overlapping) %>%
distinct()

################################################################################
# Export table
################################################################################

write_tsv(df_ec, snakemake@output$tsv)

0 comments on commit 211a229

Please sign in to comment.