From 6efc51741943c8e2e3b6e6e9684e33594d01163f Mon Sep 17 00:00:00 2001
From: Trestan Pillonel <trestan.pillonel@gmail.com>
Date: Mon, 16 Oct 2023 15:22:30 +0200
Subject: [PATCH] WIP clean large files

---
 bin/clean_work_files.sh                       | 33 +++++++++++++++++
 modules/local/clean_work.nf                   | 37 +++++++++++++++++++
 .../main.nf                                   |  7 ++--
 3 files changed, 73 insertions(+), 4 deletions(-)
 create mode 100755 bin/clean_work_files.sh
 create mode 100644 modules/local/clean_work.nf

diff --git a/bin/clean_work_files.sh b/bin/clean_work_files.sh
new file mode 100755
index 0000000..62fcafe
--- /dev/null
+++ b/bin/clean_work_files.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# https://raw.githubusercontent.com/SystemsGenetics/GEMmaker/master/bin/clean_work_files.sh
+# This script is meant for cleaning any file in a Nextflow work directory.
+# Usage: clean_work_files.sh "<files_list>". The single argument is a
+# whitespace-separated list of files to clean, e.g. the text of a Nextflow
+# channel value (its "[a, b]" list rendering is accepted). Paths containing
+# whitespace, brackets or commas cannot be expressed in this format.
+#
+# The cleaning process empties the file, converts it to a sparse file so it
+# has an actual size of zero but appears as the original size, the access
+# and modify times are kept the same.
+files_list="$1"
+# ${files_list} is deliberately unquoted below: it is split into one path per word.
+for file in ${files_list}; do
+  # Remove cruft added by Nextflow: the "[", "," and "]" of a rendered list
+  file=${file//[][,]/}
+  if [ -e "$file" ]; then
+    # Log some info about the file for debugging purposes
+    echo "cleaning $file"
+    stat -- "$file"
+    # Get file info: size, access and modify times
+    size=$(stat --printf="%s" -- "$file")
+    atime=$(stat --printf="%X" -- "$file")
+    mtime=$(stat --printf="%Y" -- "$file")
+
+    # Make the file size 0 and set as a sparse file
+    : > "$file"
+    truncate -s "$size" -- "$file"
+    # Reset the timestamps on the file
+    touch -a -d "@$atime" -- "$file"
+    touch -m -d "@$mtime" -- "$file"
+  fi
+done
\ No newline at end of file
diff --git a/modules/local/clean_work.nf b/modules/local/clean_work.nf
new file mode 100644
index 0000000..64eb10d
--- /dev/null
+++ b/modules/local/clean_work.nf
@@ -0,0 +1,37 @@
+
+process clean_work_dirs {
+  // Runs clean_work_files.sh on every regular file found under each entry of
+  // `directory` (skipping entries that do not exist); emits 1 on IS_CLEAN.
+  input:
+  tuple val(directory)
+  output:
+  val(1), emit: IS_CLEAN
+  script:
+  """
+  for dir in ${directory}; do
+  if [ -e "\$dir" ]; then
+    echo "Cleaning: \$dir"
+    files=\$(find "\$dir" -type f)
+    echo "Files to delete: \$files"
+    clean_work_files.sh "\$files"
+  fi
+  done
+  """
+}
+
+process clean_work_files {
+  // Passes `file` as the single list argument to clean_work_files.sh; emits 1 on IS_CLEAN.
+  cache 'lenient'
+  // NOTE(review): `file` is a val input, not a path - confirm 'lenient' caching has the intended effect.
+  input:
+  val(file)
+
+  output:
+  val(1), emit: IS_CLEAN
+
+  script:
+  """
+    clean_work_files.sh "${file}"
+  """
+}
+
diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf
index 35040aa..52435c2 100644
--- a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf
+++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf
@@ -1,7 +1,7 @@
 include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/custom/sratoolsncbisettings/main'
 include { SRATOOLS_PREFETCH           } from '../../../modules/nf-core/sratools/prefetch/main'
 include { SRATOOLS_FASTERQDUMP        } from '../../../modules/nf-core/sratools/fasterqdump/main'
-include { clean_work_dirs as CLEAN_SRA } from '../../../modules/local/clean_work.nf'
+include { clean_work_dirs as CLEAN_SRA_DIR } from '../../../modules/local/clean_work.nf'
 
 //
 // Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA).
@@ -34,15 +34,14 @@ workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS {
     SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, ch_ncbi_settings, ch_dbgap_key )
     ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first())
 
-    SRATOOLS_PREFETCH.out.sra.join(SRATOOLS_FASTERQDUMP.out.reads).map{it[0]}.set{sra_to_clean}
+    SRATOOLS_PREFETCH.out.sra.join(SRATOOLS_FASTERQDUMP.out.reads).map{it[1]}.set{sra_to_clean}
 
     sra_to_clean.view()
 
     if( params.delete_intermediates ) {                                                               
-        CLEAN_SRA(sra_to_clean)                                                                       
+        CLEAN_SRA_DIR(sra_to_clean)                                                                       
     }
 
-
     emit:
     reads    = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ]
     versions = ch_versions                    // channel: [ versions.yml ]