From 6efc51741943c8e2e3b6e6e9684e33594d01163f Mon Sep 17 00:00:00 2001 From: Trestan Pillonel Date: Mon, 16 Oct 2023 15:22:30 +0200 Subject: [PATCH] WIP clean large files --- bin/clean_work_files.sh | 33 +++++++++++++++++ modules/local/clean_work.nf | 37 +++++++++++++++++++ .../main.nf | 7 ++-- 3 files changed, 73 insertions(+), 4 deletions(-) create mode 100755 bin/clean_work_files.sh create mode 100644 modules/local/clean_work.nf diff --git a/bin/clean_work_files.sh b/bin/clean_work_files.sh new file mode 100755 index 0000000..62fcafe --- /dev/null +++ b/bin/clean_work_files.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# https://raw.githubusercontent.com/SystemsGenetics/GEMmaker/master/bin/clean_work_files.sh +# This script is meant for cleaning any file in a Nextflow work directory. +# The $files_list variable is set within the Nextflow process and should +# contain the list of files that need cleaning. This can be done by creating +# a channel in a process that creates files, and merging that channel with +# a signal from another process indicating the files are ready for cleaning. +# +# The cleaning process empties the file, converts it to a sparse file so it +# has an actual size of zero but appears as the original size, the access +# and modify times are kept the same. 
+files_list="$1" + +for file in ${files_list}; do + # Remove cruft added by Nextflow (square brackets and commas) + file=`echo $file | perl -p -e 's/[\\[,\\]]//g'` + if [ -e $file ]; then + # Log some info about the file for debugging purposes + echo "cleaning $file" + stat $file + # Get file info: size, access and modify times + size=`stat --printf="%s" $file` + atime=`stat --printf="%X" $file` + mtime=`stat --printf="%Y" $file` + + # Make the file size 0 and set as a sparse file + > $file + truncate -s $size $file + # Reset the timestamps on the file + touch -a -d @$atime $file + touch -m -d @$mtime $file + fi +done \ No newline at end of file diff --git a/modules/local/clean_work.nf b/modules/local/clean_work.nf new file mode 100644 index 0000000..64eb10d --- /dev/null +++ b/modules/local/clean_work.nf @@ -0,0 +1,37 @@ + +process clean_work_dirs { + input: + tuple val(directory) + + output: + val(1), emit: IS_CLEAN + + script: + """ + for dir in ${directory}; do + if [ -e \$dir ]; then + echo "Cleaning: \$dir" + files=`find \$dir -type f ` + echo "Files to delete: \$files" + clean_work_files.sh "\$files" "null" + fi + done + """ +} + +process clean_work_files { + + cache 'lenient' + + input: + val(file) + + output: + val(1), emit: IS_CLEAN + + script: + """ + clean_work_files.sh "${file}" + """ +} + diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf index 35040aa..52435c2 100644 --- a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf @@ -1,7 +1,7 @@ include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/custom/sratoolsncbisettings/main' include { SRATOOLS_PREFETCH } from '../../../modules/nf-core/sratools/prefetch/main' include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/sratools/fasterqdump/main' -include { clean_work_dirs as CLEAN_SRA } from 
'../../../modules/local/clean_work.nf' +include { clean_work_dirs as CLEAN_SRA_DIR } from '../../../modules/local/clean_work.nf' // // Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). @@ -34,15 +34,14 @@ workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, ch_ncbi_settings, ch_dbgap_key ) ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first()) - SRATOOLS_PREFETCH.out.sra.join(SRATOOLS_FASTERQDUMP.out.reads).map{it[0]}.set{sra_to_clean} + SRATOOLS_PREFETCH.out.sra.join(SRATOOLS_FASTERQDUMP.out.reads).map{it[1]}.set{sra_to_clean} sra_to_clean.view() if( params.delete_intermediates ) { - CLEAN_SRA(sra_to_clean) + CLEAN_SRA_DIR(sra_to_clean) } - emit: reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ] versions = ch_versions // channel: [ versions.yml ]