Skip to content

Commit

Permalink
Merge pull request nf-core#434 from BU-ISCIII/blast
Browse files Browse the repository at this point in the history
Enhanced BLAST results filtering
  • Loading branch information
svarona authored Jun 18, 2024
2 parents 8161eb9 + a4f6946 commit ae67ea4
Show file tree
Hide file tree
Showing 14 changed files with 121 additions and 64 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,12 @@ jobs:
matrix:
parameters:
- "--gff false --freyja_depthcutoff 1"
- "--additional_annotation ./GCA_009858895.3_ASM985889v3_genomic.gtf.gz"
- "--input false"
- "--additional_annotation ./GCA_009858895.3_ASM985889v3_genomic.gtf.gz --freyja_depthcutoff 1"
- "--input false --freyja_depthcutoff 1"
- "--min_barcode_reads 10000"
- "--min_guppyplex_reads 10000"
- "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false"
- "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false --artic_minion_medaka_model ./r941_min_high_g360_model.hdf5"
- "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false --freyja_depthcutoff 1"
- "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false --artic_minion_medaka_model ./r941_min_high_g360_model.hdf5 --freyja_depthcutoff 1"
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
Expand Down
22 changes: 13 additions & 9 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Special thanks to the following for their code contributions to the release:

- [Adam Talbot](https://github.com/adamrtalbot)
- [Joon Klaps](https://github.com/Joon-Klaps)
- [Sarai Varona](https://github.com/svarona)

Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form.

Expand All @@ -24,18 +25,21 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
- [[PR #413](https://github.com/nf-core/viralrecon/pull/413)] - Update multiqc module & include freyja in report
- [[PR #401](https://github.com/nf-core/viralrecon/pull/401)] - Added option to add a custom annotation
- [[PR #417](https://github.com/nf-core/viralrecon/pull/417)] - Allow skipping of Freyja bootstrapping module & freyja module update
- [[PR #434](https://github.com/nf-core/viralrecon/pull/434)] - Add blast result filtering through `min_contig_length` and `min_perc_contig_aligned`.

### Parameters

| Old parameter | New parameter |
| ------------- | ------------------------- |
| | `--skip_freyja` |
| | `--freyja_repeats` |
| | `--freyja_db_name` |
| | `--freyja_barcodes` |
| | `--freyja_lineages` |
| | `--skip_freyja_boot` |
| | `--additional_annotation` |
| Old parameter | New parameter |
| ------------- | --------------------------- |
| | `--skip_freyja` |
| | `--freyja_repeats` |
| | `--freyja_db_name` |
| | `--freyja_barcodes` |
| | `--freyja_lineages` |
| | `--skip_freyja_boot` |
| | `--additional_annotation` |
| | `--min_contig_length` |
| | `--min_perc_contig_aligned` |

> **NB:** Parameter has been **updated** if both old and new parameter information is present.
> **NB:** Parameter has been **added** if just the new parameter information is present.
Expand Down
1 change: 1 addition & 0 deletions assets/headers/blast_filtered_outfmt6_header.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
stitle staxids qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovs %cgAligned %refCovered
2 changes: 1 addition & 1 deletion assets/headers/blast_outfmt6_header.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
stitle qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovs %cgAligned %refCovered
stitle staxids qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovs
21 changes: 9 additions & 12 deletions conf/modules_illumina.config
Original file line number Diff line number Diff line change
Expand Up @@ -833,16 +833,15 @@ if (!params.skip_assembly) {
if (!params.skip_blast) {
process {
withName: '.*:.*:ASSEMBLY_SPADES:.*:BLAST_BLASTN' {
ext.args = "-outfmt '6 stitle std slen qlen qcovs'"
ext.args = "-outfmt '6 stitle staxids std slen qlen qcovs'"
publishDir = [
path: { "${params.outdir}/assembly/spades/${params.spades_mode}/blastn" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
enabled: false
]
}

withName: '.*:.*:ASSEMBLY_SPADES:.*:FILTER_BLASTN' {
ext.prefix = { "${meta.id}.filter.blastn" }
ext.prefix = { "${meta.id}" }
publishDir = [
path: { "${params.outdir}/assembly/spades/${params.spades_mode}/blastn" },
mode: params.publish_dir_mode,
Expand Down Expand Up @@ -939,16 +938,15 @@ if (!params.skip_assembly) {
if (!params.skip_blast) {
process {
withName: '.*:.*:ASSEMBLY_UNICYCLER:.*:BLAST_BLASTN' {
ext.args = "-outfmt '6 stitle std slen qlen qcovs'"
ext.args = "-outfmt '6 stitle staxids std slen qlen qcovs'"
publishDir = [
path: { "${params.outdir}/assembly/unicycler/blastn" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
enabled: false
]
}

withName: '.*:.*:ASSEMBLY_UNICYCLER:.*:FILTER_BLASTN' {
ext.prefix = { "${meta.id}.filter.blastn" }
ext.prefix = { "${meta.id}" }
publishDir = [
path: { "${params.outdir}/assembly/unicycler/blastn" },
mode: params.publish_dir_mode,
Expand Down Expand Up @@ -1012,16 +1010,15 @@ if (!params.skip_assembly) {
if (!params.skip_blast) {
process {
withName: '.*:.*:ASSEMBLY_MINIA:.*:BLAST_BLASTN' {
ext.args = "-outfmt '6 stitle std slen qlen qcovs'"
ext.args = "-outfmt '6 stitle staxids std slen qlen qcovs'"
publishDir = [
path: { "${params.outdir}/assembly/minia/blastn" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
enabled: false
]
}

withName: '.*:.*:ASSEMBLY_MINIA:.*:FILTER_BLASTN' {
ext.prefix = { "${meta.id}.filter.blastn" }
ext.prefix = { "${meta.id}" }
publishDir = [
path: { "${params.outdir}/assembly/minia/blastn" },
mode: params.publish_dir_mode,
Expand Down
25 changes: 24 additions & 1 deletion docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -909,8 +909,31 @@ In the variant calling branch of the pipeline we are using [iVar trim](#ivar-tri
<summary>Output files</summary>

- `assembly/<ASSEMBLER>/blastn/`
- `*.blastn.txt`: BLAST results against the target virus.
- `*results.blastn.txt`: BLAST results against the target virus.
- `*.filter.blastn.txt`: Filtered BLAST results.
- Applied filters by default are:
- `qlen` (contig length) > 200 nt
- `%cgAligned` (percentage of contig aligned) > 0.7 (70%)
- Columns description (for more information see: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6):
- stitle: Subject Title. Name of the reference genome.
- staxids: Subject Taxonomy ID(s), separated by a ';'. When blast databse is no annotated with taxids, 0 will appear.
- qaccver: Query accesion version. Contig name.
- saccver: Subject accession version. Reference genome accession version.
- pident: Percentage of identical matches.
- length: Alignment length.
- mismatch: Number of mismatches.
- gapopen: Number of gap openings.
- qstart: Start of alignment in query (sample's contig).
- qend: End of alignment in query (sample's contig).
- sstart: Start of alignment in subject (reference genome).
- send: Start of alignment in subject (reference genome).
- evalue: Expect value.
- bitscore: Bit score. The bit-score is the requires size of a sequence database in which the current match could be found just by chance. The bit-score is a log2 scaled and normalized raw-score. Each increase by one doubles the required database size (2bit-score).
- slen: Subject (reference genome) sequence length.
- qlen: Query (contig) sequence length.
- qcovs: Query Coverage Per Subject.
- %cgAligned: Percentage of contig covered in the alignment. It is calculated dividing `length/qlen`.
- %refCovered: Percentage of reference genome covered in the alignment. It is calculated dividing `length/slen`.

**NB:** The value of `<ASSEMBLER>` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades').

Expand Down
14 changes: 10 additions & 4 deletions modules/local/filter_blastn.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,25 @@ process FILTER_BLASTN {
input:
tuple val(meta), path(hits)
path header
path filtered_header

output:
tuple val(meta), path('*.txt'), emit: txt
path "versions.yml" , emit: versions
tuple val(meta), path('*filter.blastn.txt') , emit: txt
tuple val(meta), path('*.results.blastn.txt'), emit: blast
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def prefix = task.ext.prefix ?: "${meta.id}"
def min_contig_length = params.min_contig_length
def min_perc_contig_aligned = params.min_perc_contig_aligned

"""
awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"}{print \$0,\$5/\$15,\$5/\$14}' $hits | awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"} \$15 > 200 && \$17 > 0.7 && \$1 !~ /phage/ {print \$0}' > tmp.out
cat $header tmp.out > ${prefix}.txt
cat $header $hits > ${prefix}.results.blastn.txt
awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"}{print \$0,\$6/\$16,\$6/\$15}' $hits | awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"} \$16 > ${min_contig_length} && \$18 > ${min_perc_contig_aligned} && \$1 !~ /phage/ {print \$0}' > tmp.out
cat $filtered_header tmp.out > ${prefix}.filter.blastn.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
4 changes: 3 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ params {
skip_variants_long_table = false
skip_multiqc = false

// MultiQC options
// MultiQC options
multiqc_config = null
multiqc_title = null
multiqc_logo = null
Expand Down Expand Up @@ -94,6 +94,8 @@ params {
spades_mode = 'rnaviral'
spades_hmm = null
blast_db = null
min_contig_length = 200
min_perc_contig_aligned = 0.7
skip_bandage = false
skip_blast = false
skip_abacas = false
Expand Down
12 changes: 12 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,18 @@
"type": "boolean",
"fa_icon": "fas fa-fast-forward",
"description": "Specify this parameter to skip all of the de novo assembly steps in the pipeline."
},
"min_contig_length": {
"type": "integer",
"default": 200,
"fa_icon": "fas fa-sliders-h",
"description": "Minimum contig length to filter from BLAST results."
},
"min_perc_contig_aligned": {
"type": "number",
"default": 0.7,
"fa_icon": "fas fa-sliders-h",
"description": "Minimum percentage of contig aligned to filter from BLAST results."
}
},
"fa_icon": "fas fa-random"
Expand Down
14 changes: 8 additions & 6 deletions subworkflows/local/assembly_minia.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ include { ASSEMBLY_QC } from './assembly_qc'

workflow ASSEMBLY_MINIA {
take:
reads // channel: [ val(meta), [ reads ] ]
fasta // channel: /path/to/genome.fasta
gff // channel: /path/to/genome.gff
blast_db // channel: /path/to/blast_db/
blast_header // channel: /path/to/blast_header.txt
reads // channel: [ val(meta), [ reads ] ]
fasta // channel: /path/to/genome.fasta
gff // channel: /path/to/genome.gff
blast_db // channel: /path/to/blast_db/
blast_header // channel: /path/to/blast_header.txt
blast_filtered_header // channel: /path/to/blast_filtered_header.txt

main:

Expand Down Expand Up @@ -43,7 +44,8 @@ workflow ASSEMBLY_MINIA {
fasta,
gff,
blast_db,
blast_header
blast_header,
blast_filtered_header
)
ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions)

Expand Down
16 changes: 9 additions & 7 deletions subworkflows/local/assembly_qc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@ include { QUAST } from '../../modules/nf-core/quast/main'

workflow ASSEMBLY_QC {
take:
scaffolds // channel: [ val(meta), [ scaffolds ] ]
fasta // channel: /path/to/genome.fasta
gff // channel: /path/to/genome.gff
blast_db // channel: /path/to/blast_db/
blast_header // channel: /path/to/blast_header.txt
scaffolds // channel: [ val(meta), [ scaffolds ] ]
fasta // channel: /path/to/genome.fasta
gff // channel: /path/to/genome.gff
blast_db // channel: /path/to/blast_db/
blast_header // channel: /path/to/blast_header.txt
blast_filtered_header // channel: /path/to/blast_filtered_header.txt

main:

Expand All @@ -30,13 +31,14 @@ workflow ASSEMBLY_QC {
scaffolds,
blast_db
)
ch_blast_txt = BLAST_BLASTN.out.txt
ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions.first())

FILTER_BLASTN (
BLAST_BLASTN.out.txt,
blast_header
blast_header,
blast_filtered_header
)
ch_blast_txt = FILTER_BLASTN.out.blast
ch_blast_filter_txt = FILTER_BLASTN.out.txt
ch_versions = ch_versions.mix(FILTER_BLASTN.out.versions.first())
}
Expand Down
18 changes: 10 additions & 8 deletions subworkflows/local/assembly_spades.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ include { ASSEMBLY_QC } from './assembly_qc'

workflow ASSEMBLY_SPADES {
take:
reads // channel: [ val(meta), [ reads ] ]
mode // string : spades assembly mode e.g. 'rnaviral'
hmm // channel: /path/to/spades.hmm
fasta // channel: /path/to/genome.fasta
gff // channel: /path/to/genome.gff
blast_db // channel: /path/to/blast_db/
blast_header // channel: /path/to/blast_header.txt
reads // channel: [ val(meta), [ reads ] ]
mode // string : spades assembly mode e.g. 'rnaviral'
hmm // channel: /path/to/spades.hmm
fasta // channel: /path/to/genome.fasta
gff // channel: /path/to/genome.gff
blast_db // channel: /path/to/blast_db/
blast_header // channel: /path/to/blast_header.txt
blast_filtered_header // channel: /path/to/blast_filtered_header.txt

main:

Expand Down Expand Up @@ -95,7 +96,8 @@ workflow ASSEMBLY_SPADES {
fasta,
gff,
blast_db,
blast_header
blast_header,
blast_filtered_header
)
ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions)

Expand Down
14 changes: 8 additions & 6 deletions subworkflows/local/assembly_unicycler.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@ include { ASSEMBLY_QC } from './assembly_qc'

workflow ASSEMBLY_UNICYCLER {
take:
reads // channel: [ val(meta), [ reads ] ]
fasta // channel: /path/to/genome.fasta
gff // channel: /path/to/genome.gff
blast_db // channel: /path/to/blast_db/
blast_header // channel: /path/to/blast_header.txt
reads // channel: [ val(meta), [ reads ] ]
fasta // channel: /path/to/genome.fasta
gff // channel: /path/to/genome.gff
blast_db // channel: /path/to/blast_db/
blast_header // channel: /path/to/blast_header.txt
blast_filtered_header // channel: /path/to/blast_filtered_header.txt

main:

Expand Down Expand Up @@ -81,7 +82,8 @@ workflow ASSEMBLY_UNICYCLER {
fasta,
gff,
blast_db,
blast_header
blast_header,
blast_filtered_header
)
ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions)

Expand Down
14 changes: 9 additions & 5 deletions workflows/illumina.nf
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ ch_multiqc_config = file("$projectDir/assets/multiqc_config_illumina.yml"
ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config) : []

// Header files
ch_blast_outfmt6_header = file("$projectDir/assets/headers/blast_outfmt6_header.txt", checkIfExists: true)
ch_ivar_variants_header_mqc = file("$projectDir/assets/headers/ivar_variants_header_mqc.txt", checkIfExists: true)
ch_blast_outfmt6_header = file("$projectDir/assets/headers/blast_outfmt6_header.txt", checkIfExists: true)
ch_blast_filtered_outfmt6_header = file("$projectDir/assets/headers/blast_filtered_outfmt6_header.txt", checkIfExists: true)
ch_ivar_variants_header_mqc = file("$projectDir/assets/headers/ivar_variants_header_mqc.txt", checkIfExists: true)

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -601,7 +602,8 @@ workflow ILLUMINA {
PREPARE_GENOME.out.fasta,
ch_genome_gff ? PREPARE_GENOME.out.gff.map { [ [:], it ] } : [ [:], [] ],
PREPARE_GENOME.out.blast_db,
ch_blast_outfmt6_header
ch_blast_outfmt6_header,
ch_blast_filtered_outfmt6_header
)
ch_spades_quast_multiqc = ASSEMBLY_SPADES.out.quast_tsv
ch_versions = ch_versions.mix(ASSEMBLY_SPADES.out.versions)
Expand All @@ -617,7 +619,8 @@ workflow ILLUMINA {
PREPARE_GENOME.out.fasta,
ch_genome_gff ? PREPARE_GENOME.out.gff.map { [ [:], it ] } : [ [:], [] ],
PREPARE_GENOME.out.blast_db,
ch_blast_outfmt6_header
ch_blast_outfmt6_header,
ch_blast_filtered_outfmt6_header
)
ch_unicycler_quast_multiqc = ASSEMBLY_UNICYCLER.out.quast_tsv
ch_versions = ch_versions.mix(ASSEMBLY_UNICYCLER.out.versions)
Expand All @@ -633,7 +636,8 @@ workflow ILLUMINA {
PREPARE_GENOME.out.fasta,
ch_genome_gff ? PREPARE_GENOME.out.gff.map { [ [:], it ] } : [ [:], [] ],
PREPARE_GENOME.out.blast_db,
ch_blast_outfmt6_header
ch_blast_outfmt6_header,
ch_blast_filtered_outfmt6_header
)
ch_minia_quast_multiqc = ASSEMBLY_MINIA.out.quast_tsv
ch_versions = ch_versions.mix(ASSEMBLY_MINIA.out.versions)
Expand Down

0 comments on commit ae67ea4

Please sign in to comment.