Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dsl2 add sharding of fastqs before alignment #1023

Merged
merged 15 commits into from
Nov 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v3.0.0dev - [date]
## v3.0.0dev - [2023-08-25]

### `Added`

- [#1006](https://github.com/nf-core/eager/issues/1006) Added feature to shard fastqs before mapping, allowing more flexibility in parallelisation of mapping.

### `Fixed`

### `Dependencies`
Expand Down
4 changes: 4 additions & 0 deletions CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@

> Broad Institute (2019). Picard Toolkit. GitHub Repository: https://broadinstitute.github.io/picard/

- [SeqKit](https://bioinf.shenwei.me/seqkit/)

> Shen, W., Le, S., Li, Y., & Hu, F. (2016). SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLOS ONE, 11(10), e0163962. doi:[10.1371/journal.pone.0163962](https://doi.org/10.1371/journal.pone.0163962)

- [bwa](https://doi.org/10.1093/bioinformatics/btp324)

> Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25(14), 1754–1760. doi: [10.1093/bioinformatics/btp324](https://doi.org/10.1093/bioinformatics/btp324)
Expand Down
13 changes: 13 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,18 @@ process {
]
}

//
// SHARDING FASTQS
//
withName: SEQKIT_SPLIT2 {
    // Label each task with the sample, library and lane taken from the meta map.
    tag        = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
    ext.prefix = "out"
    // Passed to the module as task.ext.args. Written as a closure so that
    // params.fastq_shard_size is resolved lazily at task time rather than being
    // interpolated once when this config file is parsed; this also matches the
    // closure style used by the other ext.args/tag settings in this file.
    ext.args   = { "-s ${params.fastq_shard_size}" }
    // Publishing is switched off for this process's outputs.
    publishDir = [
        enabled: false
    ]
}

//
// READ MAPPING
//
Expand Down Expand Up @@ -454,6 +466,7 @@ process {
publishDir = [
enabled: false
]
ext.args = { params.run_fastq_sharding ? "-c -p" : "" }
}

withName: SAMTOOLS_SORT_MERGED_LANES {
Expand Down
4 changes: 4 additions & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ params {
// Genome references
fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta'

// Sharding FASTQ
run_fastq_sharding = true
fastq_shard_size = 5000

// BAM filtering
run_bamfiltering = true
bamfiltering_minreadlength = 30
Expand Down
5 changes: 5 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules", "bam_split_by_region"]
},
"seqkit/split2": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules"]
}
}
},
Expand Down
53 changes: 53 additions & 0 deletions modules/nf-core/seqkit/split2/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions modules/nf-core/seqkit/split2/meta.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ params {
max_multiqc_email_size = '25.MB'
multiqc_methods_description = null

// Shard Fastq options
run_fastq_sharding = false
fastq_shard_size = 1000000

// bedtools options
run_bedtools_coverage = false
mapstats_bedtools_featurefile = null
Expand Down
28 changes: 19 additions & 9 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,19 @@
"description": "Options for aligning reads against reference genome(s)",
"default": "",
"properties": {
"run_fastq_sharding": {
"type": "boolean",
"description": "Turn on FastQ sharding.",
"fa_icon": "fas fa-power-off",
"help_text": "Sharding will split the FastQs into smaller chunks before mapping. These chunks are then mapped in parallel. This approach can speed up the mapping process for larger FastQ files."
},
"fastq_shard_size": {
"type": "integer",
"default": 1000000,
"description": "Specify the number of reads in each shard when splitting.",
"fa_icon": "fas fa-arrows-alt-v",
"help_text": "Make sure to choose a value that makes sense for your dataset. Small values can create many files, which can end up negatively affecting the overall speed of the mapping process."
},
"mapping_tool": {
"type": "string",
"default": "bowtie2",
Expand Down Expand Up @@ -1118,9 +1131,6 @@
{
"$ref": "#/definitions/mapping"
},
{
"$ref": "#/definitions/adna_damage_analysis"
},
{
"$ref": "#/definitions/bam_filtering"
},
Expand All @@ -1131,25 +1141,25 @@
"$ref": "#/definitions/deduplication"
},
{
"$ref": "#/definitions/mitochondrial_to_nuclear_ratio"
"$ref": "#/definitions/damage_manipulation"
},
{
"$ref": "#/definitions/mapping_statistics"
"$ref": "#/definitions/genotyping"
},
{
"$ref": "#/definitions/damage_manipulation"
"$ref": "#/definitions/mitochondrial_to_nuclear_ratio"
},
{
"$ref": "#/definitions/genotyping"
"$ref": "#/definitions/mapping_statistics"
},
{
"$ref": "#/definitions/adna_damage_analysis"
},
{
"$ref": "#/definitions/contamination_estimation"
"$ref": "#/definitions/host_removal"
},
{
"$ref": "#/definitions/host_removal"
"$ref": "#/definitions/contamination_estimation"
},
{
"$ref": "#/definitions/feature_annotation_statistics"
Expand Down
45 changes: 43 additions & 2 deletions subworkflows/local/map.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Optionally shard FASTQs, then map reads against the reference genome(s)
//

include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main'
include { FASTQ_ALIGN_BWAALN } from '../../subworkflows/nf-core/fastq_align_bwaaln/main'
include { BWA_MEM } from '../../modules/nf-core/bwa/mem/main'
include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main'
Expand All @@ -19,6 +20,47 @@ workflow MAP {
ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()

//
// Optionally split each FASTQ into shards before mapping, then pair every
// read set (sharded or not) with every reference index.
//
if ( params.run_fastq_sharding ) {

    ch_input_for_sharding = reads

    SEQKIT_SPLIT2( ch_input_for_sharding )
    ch_versions = ch_versions.mix ( SEQKIT_SPLIT2.out.versions.first() )

    sharded_reads = SEQKIT_SPLIT2.out.reads
        // One channel item per shard file instead of one list of files per input.
        .transpose()
        .map {
            meta, shard ->
                // Pull the "part_NNN" token out of the shard file name. The dots
                // are escaped so that only a literal ".fastq.gz" / ".fq.gz"
                // suffix matches. `meta + [...]` builds a fresh map, so no
                // variable is shared between closure invocations.
                def shard_number = shard.getName().replaceAll(/.*(part_\d+)\.(?:fastq|fq)\.gz/, '$1')
                [ meta + [ shard_number: shard_number ], shard ]
        }
        // Regroup files that share the same meta (now including shard_number);
        // presumably this re-pairs R1/R2 of paired-end shards - TODO confirm.
        .groupTuple()

    ch_reads_for_combining = sharded_reads

} else {

    ch_reads_for_combining = reads

}

// Every read set is combined with every reference index; the reference id is
// recorded in the read meta so downstream steps can tell the mappings apart.
ch_input_for_mapping = ch_reads_for_combining
    .combine(index)
    .multiMap {
        meta, reads, meta2, index ->
            reads: [ meta + [ reference: meta2.id ], reads ]
            index: [ meta2, index ]
    }

if ( params.mapping_tool == 'bwaaln' ) {
ch_index_for_mapping = index
ch_reads_for_mapping = reads
Expand Down Expand Up @@ -76,8 +118,7 @@ workflow MAP {
ch_input_for_lane_merge = ch_mapped_lane_bam
.map {
meta, bam ->
new_meta = meta.clone().findAll{ it.key !in ['lane', 'colour_chemistry'] }

new_meta = meta.clone().findAll{ it.key !in ['lane', 'colour_chemistry', 'shard_number'] }
[ new_meta, bam ]
}
.groupTuple()
Expand Down