nf-core · shyama-mama · Nov 11, 2023 · Jul 3, 2023 · Aug 25, 2023 · Aug 25, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,10 +3,12 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v3.0.0dev - [date]
+## v3.0.0dev - [2023-08-25]
 
 ### `Added`
 
+- [#1006](https://github.com/nf-core/eager/issues/1006) Added option to shard FASTQ files before mapping (`--shard_fastq`, `--shard_size`), allowing more flexible parallelisation of mapping.
+
 ### `Fixed`
 
 ### `Dependencies`

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -34,6 +34,9 @@
 
   > Broad Institute (2019). Picard Toolkit. GitHub Repository: https://broadinstitute.github.io/picard/
 
+- [SeqKit](https://bioinf.shenwei.me/seqkit/)  
+  > Shen, W., Le, S., Li, Y., & Hu, F. (2016). SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLOS ONE, 11(10), e0163962. doi:[10.1371/journal.pone.0163962](https://doi.org/10.1371/journal.pone.0163962)
+
 - [bwa](https://doi.org/10.1093/bioinformatics/btp324)
 
   > Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics , 25(14), 1754–1760. doi: [10.1093/bioinformatics/btp324](https://doi.org/10.1093/bioinformatics/btp324)

diff --git a/conf/modules.config b/conf/modules.config
@@ -362,6 +362,18 @@ process {
         ]
     }
 
+    //
+    // SHARDING FASTQS
+    //
+    // Per-process settings for SEQKIT_SPLIT2: shards are intermediate files, so publishing is disabled.
+    withName: SEQKIT_SPLIT2 {
+        tag = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
+        ext.prefix = "out"
+        // Closure (not a plain string) so params.shard_size is resolved lazily at task time,
+        // consistent with the other closure-based ext.args entries in this file.
+        ext.args = { "-s ${params.shard_size}" }
+        publishDir = [
+            enabled: false
+        ]
+    }
+
     //
     // READ MAPPING
     //
@@ -445,6 +457,7 @@ process {
         publishDir = [
             enabled: false
         ]
+        ext.args = { params.shard_fastq ? "-c -p" : "" }
     }
 
     withName: SAMTOOLS_SORT_MERGED_LANES {

diff --git a/conf/test.config b/conf/test.config
@@ -27,6 +27,10 @@ params {
     // Genome references
     fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta'
 
+    // Sharding FASTQ
+    shard_fastq                           = true
+    shard_size                             = 5000
+
     // BAM filtering
     run_bamfiltering                      = true
     bamfiltering_minreadlength            = 30

diff --git a/modules.json b/modules.json
@@ -199,6 +199,11 @@
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules", "bam_split_by_region"]
+                    },
+                    "seqkit/split2": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
                     }
                 }
             },

diff --git a/modules/nf-core/seqkit/split2/main.nf b/modules/nf-core/seqkit/split2/main.nf
diff --git a/modules/nf-core/seqkit/split2/meta.yml b/modules/nf-core/seqkit/split2/meta.yml
diff --git a/nextflow.config b/nextflow.config
@@ -32,6 +32,10 @@ params {
     max_multiqc_email_size     = '25.MB'
     multiqc_methods_description = null
 
+    // Shard Fastq options
+    shard_fastq                = false
+    shard_size                 = 1000000
+
     // bedtools options
     run_bedtools_coverage         = false
     mapstats_bedtools_featurefile = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -476,6 +476,19 @@
             "description": "Options for aligning reads against reference genome(s)",
             "default": "",
             "properties": {
+                "shard_fastq": {
+                    "type": "boolean",
+                    "description": "Turn on sharding to split reads into smaller chunks before mapping",
+                    "fa_icon": "fas fa-power-off",
+                    "help_text": "Sharding splits each FASTQ file into chunks of `--shard_size` reads before mapping, which can help parallelise mapping of very large FASTQ files."
+                },
+                "shard_size": {
+                    "type": "integer",
+                    "default": 1000000,
+                    "description": "Specify the number of reads in each shard when splitting.",
+                    "fa_icon": "fas fa-arrows-alt-v",
+                    "help_text": "Choose a value appropriate for the size of your dataset: small values can create a very large number of files. Only used when `--shard_fastq` is set."
+                },
                 "mapping_tool": {
                     "type": "string",
                     "default": "bowtie2",
@@ -1123,9 +1136,6 @@
         {
             "$ref": "#/definitions/mapping"
         },
-        {
-            "$ref": "#/definitions/adna_damage_analysis"
-        },
         {
             "$ref": "#/definitions/bam_filtering"
         },
@@ -1136,25 +1146,25 @@
             "$ref": "#/definitions/deduplication"
         },
         {
-            "$ref": "#/definitions/mitochondrial_to_nuclear_ratio"
+            "$ref": "#/definitions/damage_manipulation"
         },
         {
-            "$ref": "#/definitions/mapping_statistics"
+            "$ref": "#/definitions/genotyping"
         },
         {
-            "$ref": "#/definitions/damage_manipulation"
+            "$ref": "#/definitions/mitochondrial_to_nuclear_ratio"
         },
         {
-            "$ref": "#/definitions/genotyping"
+            "$ref": "#/definitions/mapping_statistics"
         },
         {
             "$ref": "#/definitions/adna_damage_analysis"
         },
         {
-            "$ref": "#/definitions/contamination_estimation"
+            "$ref": "#/definitions/host_removal"
         },
         {
-            "$ref": "#/definitions/host_removal"
+            "$ref": "#/definitions/contamination_estimation"
         },
         {
             "$ref": "#/definitions/feature_annotation_statistics"

diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf
@@ -2,10 +2,11 @@
 // Prepare reference indexing for downstream
 //
 
+include { SEQKIT_SPLIT2                                                                                                       } from '../../modules/nf-core/seqkit/split2/main'
 include { FASTQ_ALIGN_BWAALN                                                                                                  } from '../../subworkflows/nf-core/fastq_align_bwaaln/main'
 include { BWA_MEM                                                                                                             } from '../../modules/nf-core/bwa/mem/main'
 include { BOWTIE2_ALIGN                                                                                                       } from '../../modules/nf-core/bowtie2/align/main'
-include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LANES                                                                              } from '../../modules/nf-core/samtools/merge/main'
+include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LANES ; SAMTOOLS_MERGE as SAMTOOLS_MERGE_SHARDS                                    } from '../../modules/nf-core/samtools/merge/main'
 include { SAMTOOLS_SORT  as SAMTOOLS_SORT_MERGED_LANES                                                                        } from '../../modules/nf-core/samtools/sort/main'
 include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MEM; SAMTOOLS_INDEX as SAMTOOLS_INDEX_BT2; SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED_LANES } from '../../modules/nf-core/samtools/index/main'
 include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_MAPPED                                                                       } from '../../modules/nf-core/samtools/flagstat/main'
@@ -19,7 +20,28 @@ workflow MAP {
     ch_versions       = Channel.empty()
     ch_multiqc_files  = Channel.empty()
 
-    ch_input_for_mapping = reads
+    if ( params.shard_fastq ) {
+
+        ch_input_for_sharding = reads
+
+        SEQKIT_SPLIT2( ch_input_for_sharding )
+        ch_versions        = ch_versions.mix ( SEQKIT_SPLIT2.out.versions.first() )
+
+        // Emit one channel item per shard file, paired with every reference index, and
+        // split into the two named outputs (reads / index) consumed by the mappers below.
+        // NOTE(review): transpose() un-nests the reads list element by element — confirm that
+        // unmerged paired-end input (R1/R2 shards in the same list) is handled as intended.
+        ch_input_for_mapping = SEQKIT_SPLIT2.out.reads
+            .transpose()
+            .combine(index)
+            .multiMap {
+                meta, reads, meta2, index ->
+                    // Build a fresh meta map instead of assigning to an undeclared (script-global)
+                    // variable and mutating it, which is not safe when closures run concurrently.
+                    // shard_number is the 'part_NNN' token of the shard file name; it is dropped
+                    // again before lane merging so shards of a library group together.
+                    reads: [ meta + [ shard_number: reads.getName().replaceAll(/.*(part_\d+)\.fastq\.gz/, '$1'), reference: meta2.id ], reads ]
+                    index: [ meta2, index ]
+            }
+
+    } else {
+
+        ch_input_for_mapping = reads
                             .combine(index)
                             .multiMap {
                                 meta, reads, meta2, index ->
@@ -29,6 +51,8 @@ workflow MAP {
                                     index: [ meta2, index]
                             }
 
+    }
+
     if ( params.mapping_tool == 'bwaaln' ) {
         FASTQ_ALIGN_BWAALN ( ch_input_for_mapping.reads, ch_input_for_mapping.index )
 
@@ -59,8 +83,7 @@ workflow MAP {
     ch_input_for_lane_merge = ch_mapped_lane_bam
                                 .map {
                                     meta, bam ->
-                                    new_meta = meta.clone().findAll{ it.key !in ['lane', 'colour_chemistry'] }
-
+                                    new_meta = meta.clone().findAll{ it.key !in ['lane', 'colour_chemistry', 'shard_number'] }
                                     [ new_meta, bam ]
                                 }
                                 .groupTuple()