-
Notifications
You must be signed in to change notification settings - Fork 4
/
introns.smk
139 lines (112 loc) · 6.04 KB
/
introns.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
rule makeIntrons:
input: "output/mappings/readBedToGff/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.gff.gz"
output: "output/mappings/makeIntrons/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.introns.gff.gz"
shell:
'''
uuidTmpOut=$(uuidgen)
zcat {input} | makeIntrons.pl - | sort -T {TMPDIR} -k1,1 -k4,4n -k5,5n |gzip> {TMPDIR}/$uuidTmpOut
mv {TMPDIR}/$uuidTmpOut {output}
'''
rule getIntronMotif:
input:
introns = "output/mappings/makeIntrons/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.introns.gff.gz",
genome = lambda wildcards: config["GENOMESDIR"] + CAPDESIGNTOGENOME[wildcards.capDesign] + ".sorted.fa"
output:
gff = "output/mappings/getIntronMotif/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.introns.gff.gz",
tsv = "output/mappings/getIntronMotif/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.transcripts.tsv.gz"
conda: "envs/perl_env.yml"
shell:
'''
uuid=$(uuidgen)
mkdir -p {TMPDIR}/$uuid
zcat {input.introns} | grep -vP "^ERCC"| extract_intron_strand_motif.pl - {input.genome} {TMPDIR}/$uuid/$(basename {output.gff} .introns.gff.gz)
gzip {TMPDIR}/$uuid/*
mv {TMPDIR}/$uuid/* $(dirname {output.gff})
'''
rule getGencodeSpliceJunctions:
input: lambda wildcards: GENOMETOANNOTGTF[CAPDESIGNTOGENOME[wildcards.capDesign]]
output: "output/annotations/spliceJunctions/{capDesign}.gencode.spliceJunctions.list"
shell:
'''
uuidTmpOut=$(uuidgen)
cat {input} | awk '$3=="exon"' |sort -T {TMPDIR} -k1,1 -k4,4n -k5,5n | makeIntrons.pl - | awk '{{print $1"_"$4"_"$5"_"$7}}' |sort -T {TMPDIR} |uniq > {TMPDIR}/$uuidTmpOut
mv {TMPDIR}/$uuidTmpOut {output}
'''
rule getClsSpliceJunctions:
input:"output/mappings/mergedReads/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.HiSS.tmerge.min{minReadSupport}reads.splicing_status-spliced.endSupport-all.gff.gz"
output: "output/mappings/mergedReads/spliceJunctions/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.tmerge.min{minReadSupport}reads.spliceJunctions.list"
shell:
'''
uuidTmpOut=$(uuidgen)
zcat {input} | awk '$3=="exon"' |sort -T {TMPDIR} -k1,1 -k4,4n -k5,5n | makeIntrons.pl - | awk '{{print $1"_"$4"_"$5"_"$7}}' |sort -T {TMPDIR} |uniq > {TMPDIR}/$uuidTmpOut
mv {TMPDIR}/$uuidTmpOut {output}
'''
rule getCompareClsGencodeSJsStats:
input:
gencodeSJs="output/annotations/spliceJunctions/{capDesign}.gencode.spliceJunctions.list",
clsSJs="output/mappings/mergedReads/spliceJunctions/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.tmerge.min{minReadSupport}reads.spliceJunctions.list"
output: "output/statsFiles/" + "tmp/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.tmerge.min{minReadSupport}reads.vs.Gencode.SJs.stats.tsv"
shell:
'''
uuidTmpOut=$(uuidgen)
#annSJs=$(cat {input.gencodeSJs} | wc -l)
clsSJs=$(cat {input.clsSJs} | wc -l)
commonSJs=$(comm -1 -2 {input.gencodeSJs} {input.clsSJs} | wc -l)
novelSJs=$(comm -1 -3 {input.gencodeSJs} {input.clsSJs} | wc -l)
echo -e "{wildcards.techname}\t{wildcards.capDesign}\t{wildcards.sizeFrac}\t{wildcards.sampleRep}\t$clsSJs\t$commonSJs\t$novelSJs" > {TMPDIR}/$uuidTmpOut
mv {TMPDIR}/$uuidTmpOut {output}
'''
rule aggCompareClsGencodeSJsStats:
input: lambda wildcards: expand("output/statsFiles/" +"tmp/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.tmerge.min{minReadSupport}reads.vs.Gencode.SJs.stats.tsv",filtered_product, techname=TECHNAMES, capDesign=CAPDESIGNS, sizeFrac=SIZEFRACS, sampleRep=SAMPLEREPS, minReadSupport=wildcards.minReadSupport)
output: "output/statsFiles/" + "all.tmerge.min{minReadSupport}reads.vs.Gencode.SJs.stats.tsv"
shell:
'''
uuidTmpOut=$(uuidgen)
echo -e "seqTech\tcapDesign\tsizeFrac\tsampleRep\tcategory\tcount\tpercent" > {TMPDIR}/$uuidTmpOut
cat {input} | awk '{{ print $1"\\t"$2"\\t"$3"\\t"$4"\\tcommon\\t"$6"\t"$6/$5"\\n"$1"\\t"$2"\\t"$3"\\t"$4"\\tnovel\\t"$7"\t"$7/$5}}' | sort -T {TMPDIR} >> {TMPDIR}/$uuidTmpOut
mv {TMPDIR}/$uuidTmpOut {output}
'''
rule plotCompareClsGencodeSJsStats:
input: "output/statsFiles/" + "all.tmerge.min{minReadSupport}reads.vs.Gencode.SJs.stats.tsv"
output: returnPlotFilenames("output/plots/" + "tmerge.vs.Gencode.SJs.stats/{techname}/{capDesign}/{techname}_{capDesign}_{sizeFrac}_{sampleRep}.tmerge.min{minReadSupport}reads.vs.Gencode.SJs.stats")
conda: "envs/R_env.yml"
params:
filterDat=lambda wildcards: multi_figures(wildcards.capDesign, wildcards.sizeFrac, wildcards.sampleRep, wildcards.techname)
shell:
'''
echo "
library(ggplot2)
library(cowplot)
library(plyr)
library(scales)
library(gridExtra)
library(grid)
library(ggplotify)
library(data.table)
dat <- read.table('{input}', header=T, as.is=T, sep='\\t')
{params.filterDat[technameFilterString]}
{params.filterDat[capDesignFilterString]}
{params.filterDat[sizeFracFilterString]}
{params.filterDat[sampleRepFilterString]}
{params.filterDat[substSeqTechString]}
{params.filterDat[substSampleRepString]}
{params.filterDat[graphDimensions]}
dat\$category<-factor(dat\$category, ordered=TRUE, levels=rev(c('common', 'novel')))
plotBase <- \\"p <- ggplot(dat[order(dat\$category), ], aes(x=1, y=count, fill=category)) +
geom_bar(stat='identity') +
scale_fill_manual (values=c(annOnly='#7D96A2',common='#83A458', novel='#B8CF7E'), labels=c(annOnly='Only in GENCODE', common='In sample+GENCODE', novel='Only in sample')) +
ylab('# Splice Junctions')+
geom_text(position = 'stack', size=geom_textSize, aes(y = count, label = paste(sep='',percent(round(percent, digits=2)),' / ','(',comma(count),')'), hjust = 0.5, vjust = 1))+
{params.filterDat[hideXaxisLabels]}
{GGPLOT_PUB_QUALITY} +
theme(axis.ticks.x = element_blank(), axis.text.x = element_blank()) +
\\"
{params.filterDat[facetPlotSetup]}
save_plot('{output[0]}', legendOnly, base_width=wLegendOnly, base_height=hLegendOnly)
save_plot('{output[1]}', pXy, base_width=wXyPlot, base_height=hXyPlot)
save_plot('{output[2]}', pXyNoLegend, base_width=wXyNoLegendPlot, base_height=hXyNoLegendPlot)
save_plot('{output[3]}', pYx, base_width=wYxPlot, base_height=hYxPlot)
save_plot('{output[4]}', pYxNoLegend, base_width=wYxNoLegendPlot, base_height=hYxNoLegendPlot)
" > $(dirname {output[0]})/$(basename {output[0]} .legendOnly.png).r
cat $(dirname {output[0]})/$(basename {output[0]} .legendOnly.png).r | R --slave
'''