-
Notifications
You must be signed in to change notification settings - Fork 5
/
main.nf
198 lines (164 loc) · 8 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
include { GT_GFF3 } from '../../../modules/gallvp/gt/gff3/main'
include { SAMTOOLS_FAIDX } from '../../../modules/gallvp/samtools/faidx/main'
include { GT_GFF3VALIDATOR } from '../../../modules/gallvp/gt/gff3validator/main'
include { GT_STAT } from '../../../modules/gallvp/gt/stat/main'
workflow GFF3_GT_GFF3_GFF3VALIDATOR_STAT {
take:
ch_gff3 // channel: [ val(meta), gff3 ]
ch_fasta // channel: [ val(meta), fasta ]
main:
ch_versions = Channel.empty()
// MODULE: GT_GFF3
GT_GFF3 ( ch_gff3 )
ch_versions = ch_versions.mix(GT_GFF3.out.versions.first())
// MODULE: GT_GFF3VALIDATOR
GT_GFF3VALIDATOR ( GT_GFF3.out.gt_gff3 )
ch_versions = ch_versions.mix(GT_GFF3VALIDATOR.out.versions.first())
// MODULE: SAMTOOLS_FAIDX
SAMTOOLS_FAIDX(
ch_fasta,
[ [], [] ]
)
ch_fai = SAMTOOLS_FAIDX.out.fai
ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions.first())
// FUNCTION: checkGff3FastaCorrespondence
ch_gff3_fai = GT_GFF3VALIDATOR.out.success_log
| join (
GT_GFF3.out.gt_gff3
)
| map { meta, log, gff3 -> [ meta, gff3 ] }
| join (
ch_fai
)
ch_correspondence_status = ch_gff3_fai
| map { meta, gff3, fai ->
checkGff3FastaCorrespondence ( meta, gff3, fai )
}
ch_correspondence_success = ch_correspondence_status
| map { meta, success, error ->
if ( success ) {
[ meta, success ]
}
}
ch_correspondence_error = ch_correspondence_status
| map { meta, success, error ->
if ( error ) {
[ "${meta.id}.error.log", error.join('\n') ]
}
}
| collectFile
| map { error ->
[ [ id: error.baseName.replace('.error', '') ], error ]
}
ch_valid_gff3 = ch_correspondence_success
| join (
ch_gff3_fai
| map { meta, gff3, fai -> [ meta, gff3 ] }
)
| map { meta, log, gff3 -> [ meta, gff3 ] }
ch_log_for_invalid_gff3 = GT_GFF3.out.error_log
| mix (
GT_GFF3VALIDATOR.out.error_log
)
| mix (
ch_correspondence_error
)
// MODULE: GT_STAT
GT_STAT ( ch_valid_gff3 )
ch_gff3_stats = GT_STAT.out.stats
ch_versions = ch_versions.mix(GT_STAT.out.versions.first())
emit:
valid_gff3 = ch_valid_gff3 // channel: [ val(meta), gff3 ]
gff3_stats = ch_gff3_stats // channel: [ val(meta), yml ]
log_for_invalid_gff3 = ch_log_for_invalid_gff3 // channel: [ val(meta), log ]
versions = ch_versions // channel: [ versions.yml ]
}
def checkGff3FastaCorrespondence(meta, gff3File, faiFile) {
// STEP 1
// Check that gff3 has no identifiers that are not in fasta
def gff3Lines = gff3File.readLines().findAll { ! it.startsWith('#') }
def gff3Identifiers = gff3Lines.collect { it.split('\t')[0] }.unique().sort()
def fastaIdentifiers = faiFile.readLines().collect { it.split('\t')[0] }.unique().sort()
def missingIdentifiers = gff3Identifiers.findAll { ! fastaIdentifiers.contains(it) }
if (missingIdentifiers) {
return [
meta,
[], // success log
[
"Failed to validate gff3 file: ${gff3File.name}",
"GFF3 file contains identifiers not present in FASTA:",
"${missingIdentifiers.join('\n')}"
] // error log
]
}
// STEP 2
// Check that there are no coordinates in gff3 that exceed the sequence length in the parent fasta entry
def sequenceLengths = [:]
faiFile.readLines().each { line ->
def parts = line.split('\t')
sequenceLengths[parts[0]] = parts[1]
}
for ( line in gff3Lines ) {
def parts = line.split('\t')
def name = parts[0]
def start = parts[3].toInteger()
def end = parts[4].toInteger()
def seqLength = sequenceLengths[name].toInteger()
if ( start > seqLength ) {
return [
meta,
[], // success log
[
"Failed to validate gff3: ${gff3File.name}",
"Start coordinates exceed sequence length in the GFF3 file:",
"Sequence: $name",
"Sequence length: $seqLength",
"Start: $start"
] // error log
]
}
if ( end > seqLength ) {
// Check if the sequence is defined as a circular region
// Otherwise, fail
def regionLine = gff3Lines.find {
def _parts = it.split('\t')
_parts[0] == "$name" && _parts[2] == 'region'
}
if ( ! regionLine ) {
return [
meta,
[], // success log
[
"Failed to validate gff3: ${gff3File.name}",
"End coordinates exceed sequence length and the sequence attributes are also missing in GFF3 file:",
"Sequence: $name",
"Sequence length: $seqLength",
"End: $end"
] // error log
]
}
def regionAtts = regionLine.split('\t')[8]
def isCircular = regionAtts.contains('circular=true')
// Models on circular molecules are allowed to exceed sequence length
if ( isCircular ) { continue }
return [
meta,
[], // success log
[
"Failed to validate gff3: ${gff3File.name}",
"End coordinates exceed length of a non-circular sequence in GFF3 file:",
"Sequence: $name",
"Sequence length: $seqLength",
"End: $end"
] // error log
]
}
}
return [
meta,
[
"All tests passed..."
], // success log
[] // error log
]
}