-
Notifications
You must be signed in to change notification settings - Fork 2
/
dismiss_header.R
189 lines (176 loc) · 8.42 KB
/
dismiss_header.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# Copyright (C) 2014-2015 Umar Niazi
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
# File: dismiss_header.R
# Auth: [email protected], [email protected]
# DESC: header file for user defined functions - source before calling other scripts script
# Date: 24/10/2014
# NOTE: call function f_oGRSeparateStrands if using your own main scripts for strand separation
# load the libraries
if (!require(GenomicAlignments) || !require(GenomicFeatures) || !require(rtracklayer)) {
stop(paste('Bioconductor libraries GenomicAlignments, GenomicFeatures and rtracklayer required\n
visit http://www.bioconductor.org/install/ for instructions.'))}
## functions used by script
# NAME: f_bComparePoissonLiklihoodApply
# ARGS: a vector with 2 components
# DESC: if the event i.e. total number of reads on a signal interval can be divided into
# 2 subevents - then we can see if the difference between these 2 subevents
# is significant or not
# RETS: TRUE - if the 2 rates are equal
f_bComparePoissonLiklihoodApply = function(ivVector, cutoff=0.2) {
# check page 293 of introduction to statistical thought
# for details of how to divide a poisson event into 2
iLam.1 = ivVector[1]
iLam.2 = ivVector[2]
iLam.total = iLam.1 + iLam.2
# the event lambda total is composed of 2 subevents
# these subevents have a binomial distribution
x = 0:iLam.total
# liklihood of first distribution
lik.1 = dpois(iLam.1, lambda=x)
lik.1 = lik.1/max(lik.1)
lik.2 = dpois(iLam.2, lambda=x)
lik.2 = lik.2/max(lik.2)
# if any of the values match or intersect between the 2
# distributions then we can say they are not different from
# one another
x.1 = x[which(lik.1 > cutoff)]
x.2 = x[which(lik.2 > cutoff)]
if (length(x.1[x.1 %in% x.2]) > 0) return(TRUE) else return(FALSE)
} #function
# Function: f_mFastCountMateReadsOverRanges
# Desc: it is a fast version of the function CountMateReadsOverRanges, but will use more memory
# it uses the granges object (typically calculated by a peak caller like MACS)
# and a bam file name, and counts how many reads aligned from first mate and both mates
# over those ranges.
# Args: gr.signal: granges object containing the coordinates for signals in genome
# bam.file: the bam file used for alignment and peak calling
# bPaired: boolean value, default is true. if data was single end read then set it false
# as in single end reads there are no first mate reads
# Rets: a matrix N X 4 size (N being length of GRanges signal object)
f_mFastCountMateReadsOverRanges = function(gr.signal, bam.file, bPaired=T){
# matrix to store return object
mRets = matrix(NA, nrow=length(gr.signal), ncol=4, dimnames=list(NULL, c('mp', 'mm', 'bp', 'bm')))
# set flag to read bam file first mates
flag = scanBamFlag(isFirstMateRead=T)
if (bPaired){ bam = readGAlignments(bam.file, param=ScanBamParam(flag=flag, which=gr.signal))
# how many reads align with + strand and - strands
mRets[,'mp'] = countOverlaps(gr.signal, bam[strand(bam)=='+'])
mRets[,'mm'] = countOverlaps(gr.signal, bam[strand(bam)=='-'])
} #if
# now read data coming from both mates
bam = readGAlignments(bam.file, param=ScanBamParam(which=gr.signal))
# how many reads align with + strand and - strands
mRets[,'bp'] = countOverlaps(gr.signal, bam[strand(bam)=='+'])
mRets[,'bm'] = countOverlaps(gr.signal, bam[strand(bam)=='-'])
# if data was single end read then first mates will be zero in matrix
if (bPaired == F){ mRets[,'mp'] = mRets[,'bp']
mRets[,'mm'] = mRets[,'bm'] } # if
return(mRets)
} # function
# Function: f_dfReadMACS2xls
# Desc: Simple function to read data from MACS2 xls file output that is tab separated
# Args: xls.file: path to xls file produced by MACS2
# skip.lines: by default MACS2 has 22 lines of comments
# separator: by default it is a tab separated file
# Rets: a data frame with data
f_dfReadMACS2xls = function(xls.file, separator='\t'){
# read the data in MACS2 xls file
# it is a tab separated file with n lines of comments
# the comment lines start with a #, so skip these lines
infile = file(xls.file, 'rt')
input = readLines(infile, n = 1000)
n = grep('^#', x = input)
close(infile)
# we have n lines of comments, so skip n+1 lines
df = read.csv(file=xls.file, header=T, sep=separator, skip=length(n)+1)
return(df)
} # function
# Function: f_oGRMACS2toGRanges
# Desc: Simple function to read data from MACS2 xls file output that is tab separated
# Args: xls.file: path to xls file produced by MACS2
# skip.lines: by default MACS2 has 22 lines of comments
# separator: by default it is a tab separated file
# Rets: a Granges object of MACS2 xls data
f_oGRMACS2toGRanges = function(xls.file, separator='\t'){
# use previously defined function to read data frame
df = f_dfReadMACS2xls(xls.file, separator)
# convert data frame to GRanges object
gr = GRanges(seqnames=as.character(df$chr), ranges=IRanges(df$start, df$end))
dt = DataFrame(abs_summit=df$abs_summit, pileup=df$pileup,
fold_enrichment=df$fold_enrichment, neg.log10.qval=df$X.log10.qvalue.)
mcols(gr) = dt
return(gr)
} # function
# Function: f_WriteGff3
# Desc: Takes a GRanges object, separates it into 3 groups based on strands (*, + or -)
# checks for length of each group, if length is greater than 0
# writes that object into a gff3 format using rtracklayer library
# Args: a granges object to write
# Rets: none
f_WriteGff3 = function(oGR){
library(rtracklayer)
# write 3 gff files for *, + and - strand
if (length(oGR[strand(oGR)=='*']) > 0){
name.out = paste('oGRdismiss_', make.names(date()), '_double_st.gff3', sep='')
export(oGR[strand(oGR)=='*'], name.out, format='gff3')
}
if (length(oGR[strand(oGR)=='+']) > 0){
name.out = paste('oGRdismiss_', make.names(date()), '_plus_st.gff3', sep='')
export(oGR[strand(oGR)=='+'], name.out)
}
if (length(oGR[strand(oGR)=='-']) > 0){
name.out = paste('oGRdismiss_', make.names(date()), '_minus_st.gff3', sep='')
export(oGR[strand(oGR)=='-'], name.out, format='gff3')
}
} # function end
# Function: f_oGRSeparateStrands
# Desc: main function to separate strands, makes use of other previous functions
# Args: oGRsignal.strand = a granges object with ranges for locations of peaks
# csFile = location of bam file, it assumes a bam index file is present
# bPaired = boolean value, set to true for paired end reads or else set to false
# Rets: a Granges object of MACS2 xls data
f_oGRSeparateStrands = function(oGRsignal.strand, csFile, bPaired){
# count reads aligning to regions
mFirstMate = f_mFastCountMateReadsOverRanges(oGRsignal.strand, bam.file = csFile, bPaired)
mp = mFirstMate[,'mp'] + 1 # add a one to avoid zero probabilities and divisions by zero
mm = mFirstMate[,'mm'] + 1
n = mp + mm
p.plus = mp/n
p.minus = 1-p.plus
mFirstMate = cbind(mFirstMate, p.plus, p.minus)
## assign strands to regions
# check if the 2 reads on plus and minus strands are equal
# returns a boolean true if equal else false
t = apply(mFirstMate[,c('mp', 'mm')], 1, f_bComparePoissonLiklihoodApply)
# those regions where t is true are double stranded
# others will be plus stranded if reads on plus side > reads on minus side
t.f = rep(NA, length.out=nrow(mFirstMate))
for (i in 1:length(t)){
if (t[i]){
t.f[i] = 2 # both strands
} else if (mFirstMate[i,'mp'] > mFirstMate[i,'mm']) {
t.f[i] = 1 # else if plus strand gives more first mates
} else t.f[i] = 0 # else minus strand gives more first mates
} #for
st = factor(t.f, labels=c('-', '+', '*'))
# add data to GRanges object
## add data to GRanges object
oGRsignal.strand$strand_fac = t.f
strand(oGRsignal.strand) = st
df = mcols(oGRsignal.strand)
df = cbind(df, DataFrame(mFirstMate))
mcols(oGRsignal.strand) = df
return(oGRsignal.strand)
}