diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 31ad830b..a4ae5436 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -126,12 +126,12 @@ jobs: matrix: parameters: - "--gff false --freyja_depthcutoff 1" - - "--additional_annotation ./GCA_009858895.3_ASM985889v3_genomic.gtf.gz" - - "--input false" + - "--additional_annotation ./GCA_009858895.3_ASM985889v3_genomic.gtf.gz --freyja_depthcutoff 1" + - "--input false --freyja_depthcutoff 1" - "--min_barcode_reads 10000" - "--min_guppyplex_reads 10000" - - "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false" - - "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false --artic_minion_medaka_model ./r941_min_high_g360_model.hdf5" + - "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false --freyja_depthcutoff 1" + - "--artic_minion_caller medaka --sequencing_summary false --fast5_dir false --artic_minion_medaka_model ./r941_min_high_g360_model.hdf5 --freyja_depthcutoff 1" steps: - name: Check out pipeline code uses: actions/checkout@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md index 480c2c5d..8b2c1bf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Special thanks to the following for their code contributions to the release: - [Adam Talbot](https://github.com/adamrtalbot) - [Joon Klaps](https://github.com/Joon-Klaps) +- [Sarai Varona](https://github.com/svarona) Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. @@ -24,18 +25,21 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [[PR #413](https://github.com/nf-core/viralrecon/pull/413)] - Update multiqc module & include freyja in report - [[PR #401](https://github.com/nf-core/viralrecon/pull/401)] - Added option to add a custom annotation - [[PR #417](https://github.com/nf-core/viralrecon/pull/417)] - Allow skipping of Freyja bootstrapping module & freyja module update +- [[PR #434](https://github.com/nf-core/viralrecon/pull/434)] - Add blast result filtering through `min_contig_length` and `min_perc_contig_aligned`. ### Parameters -| Old parameter | New parameter | -| ------------- | ------------------------- | -| | `--skip_freyja` | -| | `--freyja_repeats` | -| | `--freyja_db_name` | -| | `--freyja_barcodes` | -| | `--freyja_lineages` | -| | `--skip_freyja_boot` | -| | `--additional_annotation` | +| Old parameter | New parameter | +| ------------- | --------------------------- | +| | `--skip_freyja` | +| | `--freyja_repeats` | +| | `--freyja_db_name` | +| | `--freyja_barcodes` | +| | `--freyja_lineages` | +| | `--skip_freyja_boot` | +| | `--additional_annotation` | +| | `--min_contig_length` | +| | `--min_perc_contig_aligned` | > **NB:** Parameter has been **updated** if both old and new parameter information is present. > **NB:** Parameter has been **added** if just the new parameter information is present. diff --git a/assets/headers/blast_filtered_outfmt6_header.txt b/assets/headers/blast_filtered_outfmt6_header.txt new file mode 100644 index 00000000..e9584e74 --- /dev/null +++ b/assets/headers/blast_filtered_outfmt6_header.txt @@ -0,0 +1 @@ +stitle staxids qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovs %cgAligned %refCovered diff --git a/assets/headers/blast_outfmt6_header.txt b/assets/headers/blast_outfmt6_header.txt index 1bbb1847..57a9bc2d 100644 --- a/assets/headers/blast_outfmt6_header.txt +++ b/assets/headers/blast_outfmt6_header.txt @@ -1 +1 @@ -stitle qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovs %cgAligned %refCovered +stitle staxids qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovs diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config index be19c215..1756cb48 100644 --- a/conf/modules_illumina.config +++ b/conf/modules_illumina.config @@ -833,16 +833,15 @@ if (!params.skip_assembly) { if (!params.skip_blast) { process { withName: '.*:.*:ASSEMBLY_SPADES:.*:BLAST_BLASTN' { - ext.args = "-outfmt '6 stitle std slen qlen qcovs'" + ext.args = "-outfmt '6 stitle staxids std slen qlen qcovs'" publishDir = [ path: { "${params.outdir}/assembly/spades/${params.spades_mode}/blastn" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + enabled: false ] } withName: '.*:.*:ASSEMBLY_SPADES:.*:FILTER_BLASTN' { - ext.prefix = { "${meta.id}.filter.blastn" } + ext.prefix = { "${meta.id}" } publishDir = [ path: { "${params.outdir}/assembly/spades/${params.spades_mode}/blastn" }, mode: params.publish_dir_mode, @@ -939,16 +938,15 @@ if (!params.skip_assembly) { if (!params.skip_blast) { process { withName: '.*:.*:ASSEMBLY_UNICYCLER:.*:BLAST_BLASTN' { - ext.args = "-outfmt '6 stitle std slen qlen qcovs'" + ext.args = "-outfmt '6 stitle staxids std slen qlen qcovs'" publishDir = [ path: { "${params.outdir}/assembly/unicycler/blastn" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + enabled: false ] } withName: '.*:.*:ASSEMBLY_UNICYCLER:.*:FILTER_BLASTN' { - ext.prefix = { "${meta.id}.filter.blastn" } + ext.prefix = { "${meta.id}" } publishDir = [ path: { "${params.outdir}/assembly/unicycler/blastn" }, mode: params.publish_dir_mode, @@ -1012,16 +1010,15 @@ if (!params.skip_assembly) { if (!params.skip_blast) { process { withName: '.*:.*:ASSEMBLY_MINIA:.*:BLAST_BLASTN' { - ext.args = "-outfmt '6 stitle std slen qlen qcovs'" + ext.args = "-outfmt '6 stitle staxids std slen qlen qcovs'" publishDir = [ path: { "${params.outdir}/assembly/minia/blastn" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + enabled: false ] } withName: '.*:.*:ASSEMBLY_MINIA:.*:FILTER_BLASTN' { - ext.prefix = { "${meta.id}.filter.blastn" } + ext.prefix = { "${meta.id}" } publishDir = [ path: { "${params.outdir}/assembly/minia/blastn" }, mode: params.publish_dir_mode, diff --git a/docs/output.md b/docs/output.md index c946c806..4881ad01 100644 --- a/docs/output.md +++ b/docs/output.md @@ -909,8 +909,31 @@ In the variant calling branch of the pipeline we are using [iVar trim](#ivar-tri Output files - `assembly//blastn/` - - `*.blastn.txt`: BLAST results against the target virus. + - `*results.blastn.txt`: BLAST results against the target virus. - `*.filter.blastn.txt`: Filtered BLAST results. + - Applied filters by default are: + - `qlen` (contig length) > 200 nt + - `%cgAligned` (percentage of contig aligned) > 0.7 (70%) + - Columns description (for more information see: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6): + - stitle: Subject Title. Name of the reference genome. + - staxids: Subject Taxonomy ID(s), separated by a ';'. When blast databse is no annotated with taxids, 0 will appear. + - qaccver: Query accesion version. Contig name. + - saccver: Subject accession version. Reference genome accession version. + - pident: Percentage of identical matches. + - length: Alignment length. + - mismatch: Number of mismatches. + - gapopen: Number of gap openings. + - qstart: Start of alignment in query (sample's contig). + - qend: End of alignment in query (sample's contig). + - sstart: Start of alignment in subject (reference genome). + - send: Start of alignment in subject (reference genome). + - evalue: Expect value. + - bitscore: Bit score. The bit-score is the requires size of a sequence database in which the current match could be found just by chance. The bit-score is a log2 scaled and normalized raw-score. Each increase by one doubles the required database size (2bit-score). + - slen: Subject (reference genome) sequence length. + - qlen: Query (contig) sequence length. + - qcovs: Query Coverage Per Subject. + - %cgAligned: Percentage of contig covered in the alignment. It is calculated dividing `length/qlen`. + - %refCovered: Percentage of reference genome covered in the alignment. It is calculated dividing `length/slen`. **NB:** The value of `` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades'). diff --git a/modules/local/filter_blastn.nf b/modules/local/filter_blastn.nf index a11a2d97..e87b559b 100644 --- a/modules/local/filter_blastn.nf +++ b/modules/local/filter_blastn.nf @@ -10,19 +10,25 @@ process FILTER_BLASTN { input: tuple val(meta), path(hits) path header + path filtered_header output: - tuple val(meta), path('*.txt'), emit: txt - path "versions.yml" , emit: versions + tuple val(meta), path('*filter.blastn.txt') , emit: txt + tuple val(meta), path('*.results.blastn.txt'), emit: blast + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: def prefix = task.ext.prefix ?: "${meta.id}" + def min_contig_length = params.min_contig_length + def min_perc_contig_aligned = params.min_perc_contig_aligned + """ - awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"}{print \$0,\$5/\$15,\$5/\$14}' $hits | awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"} \$15 > 200 && \$17 > 0.7 && \$1 !~ /phage/ {print \$0}' > tmp.out - cat $header tmp.out > ${prefix}.txt + cat $header $hits > ${prefix}.results.blastn.txt + awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"}{print \$0,\$6/\$16,\$6/\$15}' $hits | awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"} \$16 > ${min_contig_length} && \$18 > ${min_perc_contig_aligned} && \$1 !~ /phage/ {print \$0}' > tmp.out + cat $filtered_header tmp.out > ${prefix}.filter.blastn.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 15e1d831..7e358c0d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -55,7 +55,7 @@ params { skip_variants_long_table = false skip_multiqc = false -// MultiQC options + // MultiQC options multiqc_config = null multiqc_title = null multiqc_logo = null @@ -94,6 +94,8 @@ params { spades_mode = 'rnaviral' spades_hmm = null blast_db = null + min_contig_length = 200 + min_perc_contig_aligned = 0.7 skip_bandage = false skip_blast = false skip_abacas = false diff --git a/nextflow_schema.json b/nextflow_schema.json index c1ab246b..1ec42feb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -572,6 +572,18 @@ "type": "boolean", "fa_icon": "fas fa-fast-forward", "description": "Specify this parameter to skip all of the de novo assembly steps in the pipeline." + }, + "min_contig_length": { + "type": "integer", + "default": 200, + "fa_icon": "fas fa-sliders-h", + "description": "Minimum contig length to filter from BLAST results." + }, + "min_perc_contig_aligned": { + "type": "number", + "default": 0.7, + "fa_icon": "fas fa-sliders-h", + "description": "Minimum percentage of contig aligned to filter from BLAST results." } }, "fa_icon": "fas fa-random" diff --git a/subworkflows/local/assembly_minia.nf b/subworkflows/local/assembly_minia.nf index e32ac5cc..31f1179d 100644 --- a/subworkflows/local/assembly_minia.nf +++ b/subworkflows/local/assembly_minia.nf @@ -8,11 +8,12 @@ include { ASSEMBLY_QC } from './assembly_qc' workflow ASSEMBLY_MINIA { take: - reads // channel: [ val(meta), [ reads ] ] - fasta // channel: /path/to/genome.fasta - gff // channel: /path/to/genome.gff - blast_db // channel: /path/to/blast_db/ - blast_header // channel: /path/to/blast_header.txt + reads // channel: [ val(meta), [ reads ] ] + fasta // channel: /path/to/genome.fasta + gff // channel: /path/to/genome.gff + blast_db // channel: /path/to/blast_db/ + blast_header // channel: /path/to/blast_header.txt + blast_filtered_header // channel: /path/to/blast_filtered_header.txt main: @@ -43,7 +44,8 @@ workflow ASSEMBLY_MINIA { fasta, gff, blast_db, - blast_header + blast_header, + blast_filtered_header ) ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions) diff --git a/subworkflows/local/assembly_qc.nf b/subworkflows/local/assembly_qc.nf index ac98c56b..315c8cef 100644 --- a/subworkflows/local/assembly_qc.nf +++ b/subworkflows/local/assembly_qc.nf @@ -10,11 +10,12 @@ include { QUAST } from '../../modules/nf-core/quast/main' workflow ASSEMBLY_QC { take: - scaffolds // channel: [ val(meta), [ scaffolds ] ] - fasta // channel: /path/to/genome.fasta - gff // channel: /path/to/genome.gff - blast_db // channel: /path/to/blast_db/ - blast_header // channel: /path/to/blast_header.txt + scaffolds // channel: [ val(meta), [ scaffolds ] ] + fasta // channel: /path/to/genome.fasta + gff // channel: /path/to/genome.gff + blast_db // channel: /path/to/blast_db/ + blast_header // channel: /path/to/blast_header.txt + blast_filtered_header // channel: /path/to/blast_filtered_header.txt main: @@ -30,13 +31,14 @@ workflow ASSEMBLY_QC { scaffolds, blast_db ) - ch_blast_txt = BLAST_BLASTN.out.txt ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions.first()) FILTER_BLASTN ( BLAST_BLASTN.out.txt, - blast_header + blast_header, + blast_filtered_header ) + ch_blast_txt = FILTER_BLASTN.out.blast ch_blast_filter_txt = FILTER_BLASTN.out.txt ch_versions = ch_versions.mix(FILTER_BLASTN.out.versions.first()) } diff --git a/subworkflows/local/assembly_spades.nf b/subworkflows/local/assembly_spades.nf index 821ce927..ee11801d 100644 --- a/subworkflows/local/assembly_spades.nf +++ b/subworkflows/local/assembly_spades.nf @@ -11,13 +11,14 @@ include { ASSEMBLY_QC } from './assembly_qc' workflow ASSEMBLY_SPADES { take: - reads // channel: [ val(meta), [ reads ] ] - mode // string : spades assembly mode e.g. 'rnaviral' - hmm // channel: /path/to/spades.hmm - fasta // channel: /path/to/genome.fasta - gff // channel: /path/to/genome.gff - blast_db // channel: /path/to/blast_db/ - blast_header // channel: /path/to/blast_header.txt + reads // channel: [ val(meta), [ reads ] ] + mode // string : spades assembly mode e.g. 'rnaviral' + hmm // channel: /path/to/spades.hmm + fasta // channel: /path/to/genome.fasta + gff // channel: /path/to/genome.gff + blast_db // channel: /path/to/blast_db/ + blast_header // channel: /path/to/blast_header.txt + blast_filtered_header // channel: /path/to/blast_filtered_header.txt main: @@ -95,7 +96,8 @@ workflow ASSEMBLY_SPADES { fasta, gff, blast_db, - blast_header + blast_header, + blast_filtered_header ) ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions) diff --git a/subworkflows/local/assembly_unicycler.nf b/subworkflows/local/assembly_unicycler.nf index bb6a4d59..d6b0574d 100644 --- a/subworkflows/local/assembly_unicycler.nf +++ b/subworkflows/local/assembly_unicycler.nf @@ -11,11 +11,12 @@ include { ASSEMBLY_QC } from './assembly_qc' workflow ASSEMBLY_UNICYCLER { take: - reads // channel: [ val(meta), [ reads ] ] - fasta // channel: /path/to/genome.fasta - gff // channel: /path/to/genome.gff - blast_db // channel: /path/to/blast_db/ - blast_header // channel: /path/to/blast_header.txt + reads // channel: [ val(meta), [ reads ] ] + fasta // channel: /path/to/genome.fasta + gff // channel: /path/to/genome.gff + blast_db // channel: /path/to/blast_db/ + blast_header // channel: /path/to/blast_header.txt + blast_filtered_header // channel: /path/to/blast_filtered_header.txt main: @@ -81,7 +82,8 @@ workflow ASSEMBLY_UNICYCLER { fasta, gff, blast_db, - blast_header + blast_header, + blast_filtered_header ) ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions) diff --git a/workflows/illumina.nf b/workflows/illumina.nf index e43382f2..78bf9d14 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -51,8 +51,9 @@ ch_multiqc_config = file("$projectDir/assets/multiqc_config_illumina.yml" ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config) : [] // Header files -ch_blast_outfmt6_header = file("$projectDir/assets/headers/blast_outfmt6_header.txt", checkIfExists: true) -ch_ivar_variants_header_mqc = file("$projectDir/assets/headers/ivar_variants_header_mqc.txt", checkIfExists: true) +ch_blast_outfmt6_header = file("$projectDir/assets/headers/blast_outfmt6_header.txt", checkIfExists: true) +ch_blast_filtered_outfmt6_header = file("$projectDir/assets/headers/blast_filtered_outfmt6_header.txt", checkIfExists: true) +ch_ivar_variants_header_mqc = file("$projectDir/assets/headers/ivar_variants_header_mqc.txt", checkIfExists: true) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -601,7 +602,8 @@ workflow ILLUMINA { PREPARE_GENOME.out.fasta, ch_genome_gff ? PREPARE_GENOME.out.gff.map { [ [:], it ] } : [ [:], [] ], PREPARE_GENOME.out.blast_db, - ch_blast_outfmt6_header + ch_blast_outfmt6_header, + ch_blast_filtered_outfmt6_header ) ch_spades_quast_multiqc = ASSEMBLY_SPADES.out.quast_tsv ch_versions = ch_versions.mix(ASSEMBLY_SPADES.out.versions) @@ -617,7 +619,8 @@ workflow ILLUMINA { PREPARE_GENOME.out.fasta, ch_genome_gff ? PREPARE_GENOME.out.gff.map { [ [:], it ] } : [ [:], [] ], PREPARE_GENOME.out.blast_db, - ch_blast_outfmt6_header + ch_blast_outfmt6_header, + ch_blast_filtered_outfmt6_header ) ch_unicycler_quast_multiqc = ASSEMBLY_UNICYCLER.out.quast_tsv ch_versions = ch_versions.mix(ASSEMBLY_UNICYCLER.out.versions) @@ -633,7 +636,8 @@ workflow ILLUMINA { PREPARE_GENOME.out.fasta, ch_genome_gff ? PREPARE_GENOME.out.gff.map { [ [:], it ] } : [ [:], [] ], PREPARE_GENOME.out.blast_db, - ch_blast_outfmt6_header + ch_blast_outfmt6_header, + ch_blast_filtered_outfmt6_header ) ch_minia_quast_multiqc = ASSEMBLY_MINIA.out.quast_tsv ch_versions = ch_versions.mix(ASSEMBLY_MINIA.out.versions)