nf-core · shyama-mama · Nov 11, 2023 · Jul 3, 2023 · Aug 25, 2023 · Aug 25, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
-- [#1006](https://github.com/nf-core/eager/issues/1006) Added feature to shard fastqs before mapping, allowing more flexibility in parallelisation of mapping. 
+- [#1006](https://github.com/nf-core/eager/issues/1006) Added a feature to shard FASTQ files before mapping, allowing more flexible parallelisation of mapping.
 
 ### `Fixed`
 

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -34,7 +34,8 @@
 
   > Broad Institute (2019). Picard Toolkit. GitHub Repository: https://broadinstitute.github.io/picard/
 
-- [SeqKit](https://bioinf.shenwei.me/seqkit/)  
+- [SeqKit](https://bioinf.shenwei.me/seqkit/)
+
   > Shen, W., Le, S., Li, Y., & Hu, F. (2016). SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLOS ONE, 11(10), e0163962. doi:[10.1371/journal.pone.0163962](https://doi.org/10.1371/journal.pone.0163962)
 
 - [bwa](https://doi.org/10.1093/bioinformatics/btp324)

diff --git a/conf/modules.config b/conf/modules.config
@@ -368,7 +368,7 @@ process {
     withName: SEQKIT_SPLIT2 {
         tag = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
         ext.prefix = "out"
-        ext.args = "-s ${params.shard_size}"
+        ext.args = "-s ${params.fastq_shard_size}"
         publishDir = [
             enabled: false
         ]
@@ -457,7 +457,7 @@ process {
         publishDir = [
             enabled: false
         ]
-        ext.args = { params.shard_fastq ? "-c -p" : "" }
+        ext.args = { params.run_fastq_sharding ? "-c -p" : "" }
     }
 
     withName: SAMTOOLS_SORT_MERGED_LANES {

diff --git a/conf/test.config b/conf/test.config
@@ -28,9 +28,9 @@ params {
     fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta'
 
     // Sharding FASTQ
-    shard_fastq                           = true
-    shard_size                             = 5000
-    
+    run_fastq_sharding                           = true
+    fastq_shard_size                             = 5000
+
     // BAM filtering
     run_bamfiltering                      = true
     bamfiltering_minreadlength            = 30

diff --git a/nextflow.config b/nextflow.config
@@ -33,8 +33,8 @@ params {
     multiqc_methods_description = null
 
     // Shard Fastq options
-    shard_fastq                = false
-    shard_size                 = 1000000
+    run_fastq_sharding                = false
+    fastq_shard_size                  = 1000000
 
     // bedtools options
     run_bedtools_coverage         = false

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -476,18 +476,18 @@
             "description": "Options for aligning reads against reference genome(s)",
             "default": "",
             "properties": {
-                "shard_fastq": {
+                "run_fastq_sharding": {
                     "type": "boolean",
                     "description": "Turn on sharding to split reads into smaller chunks before mapping",
                     "fa_icon": "fas fa-power-off",
                     "help_text": "Sharding reads before mapping can help parallelise mapping for very large FASTQs.  "
                 },
-                "shard_size": {
+                "fastq_shard_size": {
                     "type": "integer",
                     "default": 1000000,
                     "description": "Specify the number of reads in each shard when splitting.",
                     "fa_icon": "fas fa-arrows-alt-v",
-                    "help_text": "Make sure to choose a value that makes sense for your dataset. Small values can create many files. Needs `--shard_fastq`"
+                    "help_text": "Make sure to choose a value that makes sense for your dataset. Small values can create many files. Needs `--run_fastq_sharding`"
                 },
                 "mapping_tool": {
                     "type": "string",

diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf
@@ -6,7 +6,7 @@ include { SEQKIT_SPLIT2
 include { FASTQ_ALIGN_BWAALN                                                                                                  } from '../../subworkflows/nf-core/fastq_align_bwaaln/main'
 include { BWA_MEM                                                                                                             } from '../../modules/nf-core/bwa/mem/main'
 include { BOWTIE2_ALIGN                                                                                                       } from '../../modules/nf-core/bowtie2/align/main'
-include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LANES ; SAMTOOLS_MERGE as SAMTOOLS_MERGE_SHARDS                                    } from '../../modules/nf-core/samtools/merge/main'
+include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LANES                                                                              } from '../../modules/nf-core/samtools/merge/main'
 include { SAMTOOLS_SORT  as SAMTOOLS_SORT_MERGED_LANES                                                                        } from '../../modules/nf-core/samtools/sort/main'
 include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MEM; SAMTOOLS_INDEX as SAMTOOLS_INDEX_BT2; SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED_LANES } from '../../modules/nf-core/samtools/index/main'
 include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_MAPPED                                                                       } from '../../modules/nf-core/samtools/flagstat/main'
@@ -20,39 +20,35 @@ workflow MAP {
     ch_versions       = Channel.empty()
     ch_multiqc_files  = Channel.empty()
 
-    if ( params.shard_fastq ) {
+    if ( params.run_fastq_sharding ) {
 
         ch_input_for_sharding = reads
 
         SEQKIT_SPLIT2( ch_input_for_sharding )
         ch_versions        = ch_versions.mix ( SEQKIT_SPLIT2.out.versions.first() )
 
-        ch_input_for_mapping = SEQKIT_SPLIT2.out.reads
+        reads = SEQKIT_SPLIT2.out.reads
             .transpose()
-            .combine(index)
-            .multiMap {
-                meta, reads, meta2, index ->
+            .map {
+                meta, reads ->
                     new_meta = meta.clone()
-                    new_meta.shard_number = reads.getName().replaceAll(/.*(part_\d+).fastq.gz/, '$1')
-                    new_meta.reference = meta2.id
-                    reads: [ new_meta, reads ] 
-                    index: [ meta2, index ]
-            } 
-
-    } else {
-
-        ch_input_for_mapping = reads
-                            .combine(index)
-                            .multiMap {
-                                meta, reads, meta2, index ->
-                                    new_meta = meta.clone()
-                                    new_meta.reference = meta2.id
-                                    reads: [ new_meta, reads ]
-                                    index: [ meta2, index]
-                            }
+                    new_meta.shard_number = reads.getName().replaceAll(/.*(part_\d+).(?:fastq|fq).gz/, '$1')
+                    [ new_meta, reads ] 
+            }
+            .groupTuple()
 
     }
 
+    ch_input_for_mapping = reads
+        .combine(index) // no 'by' key: each reads entry is crossed with each index entry
+        .multiMap {
+            meta, reads, meta2, index ->
+                def new_meta = meta.clone() // 'def' keeps new_meta local to this closure call rather than a shared script-level variable
+                new_meta.reference = meta2.id // tag the copied meta with the id of the reference it is paired with
+                reads: [ new_meta, reads ]
+                index: [ meta2, index ]
+        }
+
     if ( params.mapping_tool == 'bwaaln' ) {
         FASTQ_ALIGN_BWAALN ( ch_input_for_mapping.reads, ch_input_for_mapping.index )