nf_cellranger

#!/usr/bin/env nextflow
nextflow.enable.dsl=2

// last modified 23 August 2021
// ---------------------------------------------------------------------------
// Default parameter values. Each of these may be overridden on the command
// line with the matching '--<name>' option.
// ---------------------------------------------------------------------------

// Informational switches
params.help         = false
params.verbose      = false
params.list_genomes = false

// Reference genome and CellRanger-specific settings
params.genome          = ""
params.cellranger_args = ''
params.single_end      = false   // false means: auto-detect

// Sample selection and input/output locations
params.sample       = ''
params.id           = ''
params.fastqs       = ''
params.fastqs_dir   = '.'
params.outdir       = "."
params.sample_sheet = 'samples.csv'   // only here for testing, currently unused

// When --help is given: print the usage text, then terminate straight away
if (params.help) {
    helpMessage()
    exit 1
}

// Early status messages. All workflow messages share the '[WORKFLOW]' tag so that
// they can be found reliably when searching the log output.

// Announce that the genome listing was requested (--list_genomes)
if (params.list_genomes){
    println ("[WORKFLOW] List genomes selected")
}

// In verbose mode, echo the extra arguments given via --cellranger_args
if (params.verbose){
    println ("[WORKFLOW] CELLRANGER ARGS ARE: "       + params.cellranger_args)
}

// Helper functions shared between pipelines
include { makeFilesChannel; getFileBaseNames } from './nf_modules/files.mod.nf'
include { getGenome; listGenomes }             from './nf_modules/genomes.mod.nf'

// --list_genomes: print every available genome, after which listGenomes() exits
if (params.list_genomes) {
    listGenomes()
}

// Look up the requested genome first; the result is handed to the CellRanger
// module as its 'genome' parameter, so this has to precede the include below.
genome = getGenome(params.genome)

include { CELLRANGER_COUNT } from './nf_modules/cellranger.mod.nf' params(genome: genome)

// CellRanger runs currently do not use a file channel; instead, a sample ID must be supplied via --sample <sampleID>
// file_ch = makeFilesChannel(args)
// file_ch.view()
// absolute_path = ''
// println("Absolute path:" + absolute_path)

// A sample ID is mandatory, because the workflow hands params.sample straight to CELLRANGER_COUNT.
// Abort with usage guidance whenever it is missing. This check must not depend on --verbose:
// previously a valid --sample without --verbose also ended up in the error branch.
if (!params.sample){
    println ("\nPlease specify a sample ID with --sample=Sample_Name\n\n(For the sample ID, please no illegal characters! The characters not allowed are the space character and the following: ? ( ) [ ] / \\ = + < > : ; ' \" , * ^ | & . ")
    exit 1
}

// Only echo the selected sample ID in verbose mode
if (params.verbose){
    println ("[WORKFLOW] Cellranger sampleID: "       + params.sample)
}

// Illumina FASTQ files use the following naming scheme when processed with bcl2fastq:
// <sample name>_<barcode sequence>_L<lane>_R<read number>_<setnumber>.fastq.gz

// Illegal Characters
// Project and sample names in the sample sheet cannot contain illegal characters not
// allowed by some file systems. The characters not allowed are the space character and the
// following:
// ? ( ) [ ] / \ = + < > : ; " ' , * ^ | & .

// Let's assume these are some sample files:
// SIGAG5_9_dnmt3ab_DKO_S2_L005_I1_001.fastq.gz
// SIGAG5_9_dnmt3ab_DKO_S2_L005_R1_001.fastq.gz
// SIGAG5_9_dnmt3ab_DKO_S2_L005_R2_001.fastq.gz

// Then this could be a typical Cellranger count run:
// cellranger count --id='SIGAG5_9_dnmt3ab_DKO' \
//             --fastqs='/bi/cellranger/testrun/' \
//             --transcriptome='/bi/cellranger/references/' \
//             --sample=SIGAG5_9_dnmt3ab_DKO \
//             --localcores=16 \
//             --localmem=35

workflow {
    main:

        // Unlike our usual pipelines, this one is not driven by a list of input files:
        // all it needs is a sample name. A possible future improvement is to derive the
        // sample names automatically when a folder holds several sets of files.

        CELLRANGER_COUNT(
            params.sample,
            params.fastqs_dir,
            params.outdir,
            params.cellranger_args,
            params.verbose
        )

        // Earlier, non-working attempt at a file-channel based invocation. It is kept
        // here for reference for the time being:
        //
        // file_ch
        // subscribe a function to the channel printing the emitted values
        // .map {absolute_path -> it[1][0]}
        // .subscribe onNext:{  println "Got: $it\n and PATH: $absolute_path"  }
        // println("Absolute path now:" + absolute_path)
        //   CELLRANGER_COUNT (file_ch, params.outdir, params.cellranger_args, params.verbose)
}

// Since workflows with very long command lines tend to fail to get rendered at all, I was experimenting with a
// minimal execution summary report so we at least know what the working directory was...
workflow.onComplete {

    // Build a short plain-text execution summary from the workflow metadata.
    // The backslash after the opening quotes suppresses the leading newline, and
    // stripIndent() removes the common indentation of the lines below.
    // (The 'Success' entry used to be listed twice; it is now reported once.)
    def msg = """\
        Pipeline execution summary
        ---------------------------
        Jobname     : ${workflow.runName}
        Success     : ${workflow.success}
        Command Line: ${workflow.commandLine}
        Completed at: ${workflow.complete}
        Duration    : ${workflow.duration}
        workDir     : ${workflow.workDir}
        exit status : ${workflow.exitStatus}
        Nextflow version: ${workflow.nextflow.version}
        """
    .stripIndent()

    // Mail the summary to the user who launched the run; the address is built from the
    // launching user's name plus the institute mail domain.
    sendMail(to: "${workflow.userName}@babraham.ac.uk", subject: 'Minimal pipeline execution report', body: msg)
}

/*
 * Prints the usage/help text of this workflow via log.info.
 *
 * Takes no arguments and returns nothing of interest; the caller decides whether to
 * exit afterwards. The text only describes options that this script actually declares
 * as params (the previous version had been copied from the Bismark workflow and
 * documented '--bismark_args' and '<input files>', neither of which is used here, while
 * omitting the mandatory '--sample' option).
 */
def helpMessage() {

    log.info """
    >>


    SYNOPSIS:

    This workflow runs 'cellranger count' for a single sample, using the stone compute cluster at Babraham.

    Unlike most of our other workflows, it does not take a list of input files. Instead, it requires a sample ID
    (see '--sample' below) and the folder containing the FastQ files of that sample (see '--fastqs_dir' below).
    If called as is, CellRanger is run in default mode. To add additional parameters, please consider tool-specific
    arguments that are compatible with 'cellranger count' (see '--cellranger_args' below).


            ==============================================================================================================


    USAGE:

    nf_cellranger [options] --genome <genomeID> --sample <sampleID>

    Mandatory arguments:
    ====================

      --sample [str]                  Sample ID to be processed, e.g. for a file called 'SIGAG5_9_dnmt3ab_DKO_S2_L005_R1_001.fastq.gz'
                                      the sample ID would be 'SIGAG5_9_dnmt3ab_DKO'. The sample ID must not contain spaces
                                      or other characters that are illegal in Illumina sample names.

      --genome [str]                  Genome build ID to be used for the alignment, e.g. GRCh38 (latest human genome) or GRCm38
                                      (latest mouse genome build). To list all available genomes, see '--list_genomes' below.


    Tool-Specific Options:
    ======================

      --cellranger_args="[str]"       This option can take any number of options that are compatible with 'cellranger count' to
                                      modify its default behaviour. For more detailed information on available options please
                                      refer to the CellRanger documentation, or run 'cellranger count --help'. As an example, to
                                      set the expected number of cells use ' --cellranger_args="--expect-cells=5000" '. Please
                                      note that the format ="your options" needs to be strictly adhered to in order to work correctly.
                                      [Default: None]

      --fastqs_dir [str]              The folder where CellRanger expects the FastQ files. Default: '.' (i.e. current working directory)

    Other Options:
    ==============

      --outdir [str]                  Path to the output directory. [Default: current working directory]

      --list_genomes                  List all genome builds that are currently available to choose from. To see this list
                                      of available genomes with more detailed information about paths and indexes, run
                                      the command as '--list_genomes --verbose'

      --single_end                    Force files of a read pair to be treated as single-end files. [Default: auto-detect]

      --verbose                       More verbose status messages. [Default: OFF]
      --help                          Displays this help message and exits.

    Workflow Options:
    =================

    Please note the single '-' hyphen for the following options!

      -resume                         If a pipeline workflow has been interrupted or stopped (e.g. by accidentally closing a laptop),
                                      this option will attempt to resume the workflow at the point it got interrupted by using
                                      Nextflow's caching mechanism. This may save a lot of time.

      -bg                             Sends the entire workflow into the background, thus disconnecting it from the terminal session.
                                      This option launches a daemon process (which will keep running on the headnode) that watches over
                                      your workflow, and submits new jobs to the SLURM queue as required. Use this option for big pipeline
                                      jobs, or whenever you do not want to watch the status progress yourself. Upon completion, the
                                      pipeline will send you an email with the job details. This option is HIGHLY RECOMMENDED!

      -process.executor=local         Temporarily changes where the workflow is executed to the 'local' machine. Please also see the Nextflow
                                      config file for details. [Default: slurm]


    <<
    """.stripIndent()

}