diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 31ad830b..a4ae5436 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -126,12 +126,12 @@ jobs:
matrix:
parameters:
- "--gff false --freyja_depthcutoff 1"
- - "--additional_annotation ./GCA_009858895.3_ASM985889v3_genomic.gtf.gz"
- - "--input false"
+ - "--additional_annotation ./GCA_009858895.3_ASM985889v3_genomic.gtf.gz --freyja_depthcutoff 1"
+ - "--input false --freyja_depthcutoff 1"
- "--min_barcode_reads 10000"
- "--min_guppyplex_reads 10000"
- - "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false"
- - "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false --artic_minion_medaka_model ./r941_min_high_g360_model.hdf5"
+ - "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false --freyja_depthcutoff 1"
+ - "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false --artic_minion_medaka_model ./r941_min_high_g360_model.hdf5 --freyja_depthcutoff 1"
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 480c2c5d..8b2c1bf3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ Special thanks to the following for their code contributions to the release:
- [Adam Talbot](https://github.com/adamrtalbot)
- [Joon Klaps](https://github.com/Joon-Klaps)
+- [Sarai Varona](https://github.com/svarona)
Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form.
@@ -24,18 +25,21 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
- [[PR #413](https://github.com/nf-core/viralrecon/pull/413)] - Update multiqc module & include freyja in report
- [[PR #401](https://github.com/nf-core/viralrecon/pull/401)] - Added option to add a custom annotation
- [[PR #417](https://github.com/nf-core/viralrecon/pull/417)] - Allow skipping of Freyja bootstrapping module & freyja module update
+- [[PR #434](https://github.com/nf-core/viralrecon/pull/434)] - Add blast result filtering through `min_contig_length` and `min_perc_contig_aligned`.
### Parameters
-| Old parameter | New parameter |
-| ------------- | ------------------------- |
-| | `--skip_freyja` |
-| | `--freyja_repeats` |
-| | `--freyja_db_name` |
-| | `--freyja_barcodes` |
-| | `--freyja_lineages` |
-| | `--skip_freyja_boot` |
-| | `--additional_annotation` |
+| Old parameter | New parameter |
+| ------------- | --------------------------- |
+| | `--skip_freyja` |
+| | `--freyja_repeats` |
+| | `--freyja_db_name` |
+| | `--freyja_barcodes` |
+| | `--freyja_lineages` |
+| | `--skip_freyja_boot` |
+| | `--additional_annotation` |
+| | `--min_contig_length` |
+| | `--min_perc_contig_aligned` |
> **NB:** Parameter has been **updated** if both old and new parameter information is present.
> **NB:** Parameter has been **added** if just the new parameter information is present.
diff --git a/assets/headers/blast_filtered_outfmt6_header.txt b/assets/headers/blast_filtered_outfmt6_header.txt
new file mode 100644
index 00000000..e9584e74
--- /dev/null
+++ b/assets/headers/blast_filtered_outfmt6_header.txt
@@ -0,0 +1 @@
+stitle staxids qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovs %cgAligned %refCovered
diff --git a/assets/headers/blast_outfmt6_header.txt b/assets/headers/blast_outfmt6_header.txt
index 1bbb1847..57a9bc2d 100644
--- a/assets/headers/blast_outfmt6_header.txt
+++ b/assets/headers/blast_outfmt6_header.txt
@@ -1 +1 @@
-stitle qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovs %cgAligned %refCovered
+stitle staxids qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovs
diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config
index be19c215..1756cb48 100644
--- a/conf/modules_illumina.config
+++ b/conf/modules_illumina.config
@@ -833,16 +833,15 @@ if (!params.skip_assembly) {
if (!params.skip_blast) {
process {
withName: '.*:.*:ASSEMBLY_SPADES:.*:BLAST_BLASTN' {
- ext.args = "-outfmt '6 stitle std slen qlen qcovs'"
+ ext.args = "-outfmt '6 stitle staxids std slen qlen qcovs'"
publishDir = [
path: { "${params.outdir}/assembly/spades/${params.spades_mode}/blastn" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ enabled: false
]
}
withName: '.*:.*:ASSEMBLY_SPADES:.*:FILTER_BLASTN' {
- ext.prefix = { "${meta.id}.filter.blastn" }
+ ext.prefix = { "${meta.id}" }
publishDir = [
path: { "${params.outdir}/assembly/spades/${params.spades_mode}/blastn" },
mode: params.publish_dir_mode,
@@ -939,16 +938,15 @@ if (!params.skip_assembly) {
if (!params.skip_blast) {
process {
withName: '.*:.*:ASSEMBLY_UNICYCLER:.*:BLAST_BLASTN' {
- ext.args = "-outfmt '6 stitle std slen qlen qcovs'"
+ ext.args = "-outfmt '6 stitle staxids std slen qlen qcovs'"
publishDir = [
path: { "${params.outdir}/assembly/unicycler/blastn" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ enabled: false
]
}
withName: '.*:.*:ASSEMBLY_UNICYCLER:.*:FILTER_BLASTN' {
- ext.prefix = { "${meta.id}.filter.blastn" }
+ ext.prefix = { "${meta.id}" }
publishDir = [
path: { "${params.outdir}/assembly/unicycler/blastn" },
mode: params.publish_dir_mode,
@@ -1012,16 +1010,15 @@ if (!params.skip_assembly) {
if (!params.skip_blast) {
process {
withName: '.*:.*:ASSEMBLY_MINIA:.*:BLAST_BLASTN' {
- ext.args = "-outfmt '6 stitle std slen qlen qcovs'"
+ ext.args = "-outfmt '6 stitle staxids std slen qlen qcovs'"
publishDir = [
path: { "${params.outdir}/assembly/minia/blastn" },
- mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ enabled: false
]
}
withName: '.*:.*:ASSEMBLY_MINIA:.*:FILTER_BLASTN' {
- ext.prefix = { "${meta.id}.filter.blastn" }
+ ext.prefix = { "${meta.id}" }
publishDir = [
path: { "${params.outdir}/assembly/minia/blastn" },
mode: params.publish_dir_mode,
diff --git a/docs/output.md b/docs/output.md
index c946c806..4881ad01 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -909,8 +909,31 @@ In the variant calling branch of the pipeline we are using [iVar trim](#ivar-tri
Output files
- `assembly//blastn/`
- - `*.blastn.txt`: BLAST results against the target virus.
+ - `*results.blastn.txt`: BLAST results against the target virus.
- `*.filter.blastn.txt`: Filtered BLAST results.
+ - Applied filters by default are:
+ - `qlen` (contig length) > 200 nt
+ - `%cgAligned` (percentage of contig aligned) > 0.7 (70%)
+ - Columns description (for more information see: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6):
+ - stitle: Subject Title. Name of the reference genome.
+ - staxids: Subject Taxonomy ID(s), separated by a ';'. When blast databse is no annotated with taxids, 0 will appear.
+ - qaccver: Query accesion version. Contig name.
+ - saccver: Subject accession version. Reference genome accession version.
+ - pident: Percentage of identical matches.
+ - length: Alignment length.
+ - mismatch: Number of mismatches.
+ - gapopen: Number of gap openings.
+ - qstart: Start of alignment in query (sample's contig).
+ - qend: End of alignment in query (sample's contig).
+ - sstart: Start of alignment in subject (reference genome).
+ - send: Start of alignment in subject (reference genome).
+ - evalue: Expect value.
+ - bitscore: Bit score. The bit-score is the requires size of a sequence database in which the current match could be found just by chance. The bit-score is a log2 scaled and normalized raw-score. Each increase by one doubles the required database size (2bit-score).
+ - slen: Subject (reference genome) sequence length.
+ - qlen: Query (contig) sequence length.
+ - qcovs: Query Coverage Per Subject.
+ - %cgAligned: Percentage of contig covered in the alignment. It is calculated dividing `length/qlen`.
+ - %refCovered: Percentage of reference genome covered in the alignment. It is calculated dividing `length/slen`.
**NB:** The value of `` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades').
diff --git a/modules/local/filter_blastn.nf b/modules/local/filter_blastn.nf
index a11a2d97..e87b559b 100644
--- a/modules/local/filter_blastn.nf
+++ b/modules/local/filter_blastn.nf
@@ -10,19 +10,25 @@ process FILTER_BLASTN {
input:
tuple val(meta), path(hits)
path header
+ path filtered_header
output:
- tuple val(meta), path('*.txt'), emit: txt
- path "versions.yml" , emit: versions
+ tuple val(meta), path('*filter.blastn.txt') , emit: txt
+ tuple val(meta), path('*.results.blastn.txt'), emit: blast
+ path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def prefix = task.ext.prefix ?: "${meta.id}"
+ def min_contig_length = params.min_contig_length
+ def min_perc_contig_aligned = params.min_perc_contig_aligned
+
"""
- awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"}{print \$0,\$5/\$15,\$5/\$14}' $hits | awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"} \$15 > 200 && \$17 > 0.7 && \$1 !~ /phage/ {print \$0}' > tmp.out
- cat $header tmp.out > ${prefix}.txt
+ cat $header $hits > ${prefix}.results.blastn.txt
+ awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"}{print \$0,\$6/\$16,\$6/\$15}' $hits | awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"} \$16 > ${min_contig_length} && \$18 > ${min_perc_contig_aligned} && \$1 !~ /phage/ {print \$0}' > tmp.out
+ cat $filtered_header tmp.out > ${prefix}.filter.blastn.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
diff --git a/nextflow.config b/nextflow.config
index 15e1d831..7e358c0d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -55,7 +55,7 @@ params {
skip_variants_long_table = false
skip_multiqc = false
-// MultiQC options
+ // MultiQC options
multiqc_config = null
multiqc_title = null
multiqc_logo = null
@@ -94,6 +94,8 @@ params {
spades_mode = 'rnaviral'
spades_hmm = null
blast_db = null
+ min_contig_length = 200
+ min_perc_contig_aligned = 0.7
skip_bandage = false
skip_blast = false
skip_abacas = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index c1ab246b..1ec42feb 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -572,6 +572,18 @@
"type": "boolean",
"fa_icon": "fas fa-fast-forward",
"description": "Specify this parameter to skip all of the de novo assembly steps in the pipeline."
+ },
+ "min_contig_length": {
+ "type": "integer",
+ "default": 200,
+ "fa_icon": "fas fa-sliders-h",
+ "description": "Minimum contig length to filter from BLAST results."
+ },
+ "min_perc_contig_aligned": {
+ "type": "number",
+ "default": 0.7,
+ "fa_icon": "fas fa-sliders-h",
+ "description": "Minimum percentage of contig aligned to filter from BLAST results."
}
},
"fa_icon": "fas fa-random"
diff --git a/subworkflows/local/assembly_minia.nf b/subworkflows/local/assembly_minia.nf
index e32ac5cc..31f1179d 100644
--- a/subworkflows/local/assembly_minia.nf
+++ b/subworkflows/local/assembly_minia.nf
@@ -8,11 +8,12 @@ include { ASSEMBLY_QC } from './assembly_qc'
workflow ASSEMBLY_MINIA {
take:
- reads // channel: [ val(meta), [ reads ] ]
- fasta // channel: /path/to/genome.fasta
- gff // channel: /path/to/genome.gff
- blast_db // channel: /path/to/blast_db/
- blast_header // channel: /path/to/blast_header.txt
+ reads // channel: [ val(meta), [ reads ] ]
+ fasta // channel: /path/to/genome.fasta
+ gff // channel: /path/to/genome.gff
+ blast_db // channel: /path/to/blast_db/
+ blast_header // channel: /path/to/blast_header.txt
+ blast_filtered_header // channel: /path/to/blast_filtered_header.txt
main:
@@ -43,7 +44,8 @@ workflow ASSEMBLY_MINIA {
fasta,
gff,
blast_db,
- blast_header
+ blast_header,
+ blast_filtered_header
)
ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions)
diff --git a/subworkflows/local/assembly_qc.nf b/subworkflows/local/assembly_qc.nf
index ac98c56b..315c8cef 100644
--- a/subworkflows/local/assembly_qc.nf
+++ b/subworkflows/local/assembly_qc.nf
@@ -10,11 +10,12 @@ include { QUAST } from '../../modules/nf-core/quast/main'
workflow ASSEMBLY_QC {
take:
- scaffolds // channel: [ val(meta), [ scaffolds ] ]
- fasta // channel: /path/to/genome.fasta
- gff // channel: /path/to/genome.gff
- blast_db // channel: /path/to/blast_db/
- blast_header // channel: /path/to/blast_header.txt
+ scaffolds // channel: [ val(meta), [ scaffolds ] ]
+ fasta // channel: /path/to/genome.fasta
+ gff // channel: /path/to/genome.gff
+ blast_db // channel: /path/to/blast_db/
+ blast_header // channel: /path/to/blast_header.txt
+ blast_filtered_header // channel: /path/to/blast_filtered_header.txt
main:
@@ -30,13 +31,14 @@ workflow ASSEMBLY_QC {
scaffolds,
blast_db
)
- ch_blast_txt = BLAST_BLASTN.out.txt
ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions.first())
FILTER_BLASTN (
BLAST_BLASTN.out.txt,
- blast_header
+ blast_header,
+ blast_filtered_header
)
+ ch_blast_txt = FILTER_BLASTN.out.blast
ch_blast_filter_txt = FILTER_BLASTN.out.txt
ch_versions = ch_versions.mix(FILTER_BLASTN.out.versions.first())
}
diff --git a/subworkflows/local/assembly_spades.nf b/subworkflows/local/assembly_spades.nf
index 821ce927..ee11801d 100644
--- a/subworkflows/local/assembly_spades.nf
+++ b/subworkflows/local/assembly_spades.nf
@@ -11,13 +11,14 @@ include { ASSEMBLY_QC } from './assembly_qc'
workflow ASSEMBLY_SPADES {
take:
- reads // channel: [ val(meta), [ reads ] ]
- mode // string : spades assembly mode e.g. 'rnaviral'
- hmm // channel: /path/to/spades.hmm
- fasta // channel: /path/to/genome.fasta
- gff // channel: /path/to/genome.gff
- blast_db // channel: /path/to/blast_db/
- blast_header // channel: /path/to/blast_header.txt
+ reads // channel: [ val(meta), [ reads ] ]
+ mode // string : spades assembly mode e.g. 'rnaviral'
+ hmm // channel: /path/to/spades.hmm
+ fasta // channel: /path/to/genome.fasta
+ gff // channel: /path/to/genome.gff
+ blast_db // channel: /path/to/blast_db/
+ blast_header // channel: /path/to/blast_header.txt
+ blast_filtered_header // channel: /path/to/blast_filtered_header.txt
main:
@@ -95,7 +96,8 @@ workflow ASSEMBLY_SPADES {
fasta,
gff,
blast_db,
- blast_header
+ blast_header,
+ blast_filtered_header
)
ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions)
diff --git a/subworkflows/local/assembly_unicycler.nf b/subworkflows/local/assembly_unicycler.nf
index bb6a4d59..d6b0574d 100644
--- a/subworkflows/local/assembly_unicycler.nf
+++ b/subworkflows/local/assembly_unicycler.nf
@@ -11,11 +11,12 @@ include { ASSEMBLY_QC } from './assembly_qc'
workflow ASSEMBLY_UNICYCLER {
take:
- reads // channel: [ val(meta), [ reads ] ]
- fasta // channel: /path/to/genome.fasta
- gff // channel: /path/to/genome.gff
- blast_db // channel: /path/to/blast_db/
- blast_header // channel: /path/to/blast_header.txt
+ reads // channel: [ val(meta), [ reads ] ]
+ fasta // channel: /path/to/genome.fasta
+ gff // channel: /path/to/genome.gff
+ blast_db // channel: /path/to/blast_db/
+ blast_header // channel: /path/to/blast_header.txt
+ blast_filtered_header // channel: /path/to/blast_filtered_header.txt
main:
@@ -81,7 +82,8 @@ workflow ASSEMBLY_UNICYCLER {
fasta,
gff,
blast_db,
- blast_header
+ blast_header,
+ blast_filtered_header
)
ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions)
diff --git a/workflows/illumina.nf b/workflows/illumina.nf
index e43382f2..78bf9d14 100644
--- a/workflows/illumina.nf
+++ b/workflows/illumina.nf
@@ -51,8 +51,9 @@ ch_multiqc_config = file("$projectDir/assets/multiqc_config_illumina.yml"
ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config) : []
// Header files
-ch_blast_outfmt6_header = file("$projectDir/assets/headers/blast_outfmt6_header.txt", checkIfExists: true)
-ch_ivar_variants_header_mqc = file("$projectDir/assets/headers/ivar_variants_header_mqc.txt", checkIfExists: true)
+ch_blast_outfmt6_header = file("$projectDir/assets/headers/blast_outfmt6_header.txt", checkIfExists: true)
+ch_blast_filtered_outfmt6_header = file("$projectDir/assets/headers/blast_filtered_outfmt6_header.txt", checkIfExists: true)
+ch_ivar_variants_header_mqc = file("$projectDir/assets/headers/ivar_variants_header_mqc.txt", checkIfExists: true)
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -601,7 +602,8 @@ workflow ILLUMINA {
PREPARE_GENOME.out.fasta,
ch_genome_gff ? PREPARE_GENOME.out.gff.map { [ [:], it ] } : [ [:], [] ],
PREPARE_GENOME.out.blast_db,
- ch_blast_outfmt6_header
+ ch_blast_outfmt6_header,
+ ch_blast_filtered_outfmt6_header
)
ch_spades_quast_multiqc = ASSEMBLY_SPADES.out.quast_tsv
ch_versions = ch_versions.mix(ASSEMBLY_SPADES.out.versions)
@@ -617,7 +619,8 @@ workflow ILLUMINA {
PREPARE_GENOME.out.fasta,
ch_genome_gff ? PREPARE_GENOME.out.gff.map { [ [:], it ] } : [ [:], [] ],
PREPARE_GENOME.out.blast_db,
- ch_blast_outfmt6_header
+ ch_blast_outfmt6_header,
+ ch_blast_filtered_outfmt6_header
)
ch_unicycler_quast_multiqc = ASSEMBLY_UNICYCLER.out.quast_tsv
ch_versions = ch_versions.mix(ASSEMBLY_UNICYCLER.out.versions)
@@ -633,7 +636,8 @@ workflow ILLUMINA {
PREPARE_GENOME.out.fasta,
ch_genome_gff ? PREPARE_GENOME.out.gff.map { [ [:], it ] } : [ [:], [] ],
PREPARE_GENOME.out.blast_db,
- ch_blast_outfmt6_header
+ ch_blast_outfmt6_header,
+ ch_blast_filtered_outfmt6_header
)
ch_minia_quast_multiqc = ASSEMBLY_MINIA.out.quast_tsv
ch_versions = ch_versions.mix(ASSEMBLY_MINIA.out.versions)