-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
131 lines (109 loc) · 5.89 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
Seroba pipeline
Authors: Alejandra Hernandez-Segura
Organization: Rijksinstituut voor Volksgezondheid en Milieu (RIVM)
Department: Infektieziekteonderzoek, Diagnostiek en Laboratorium Surveillance (IDS), Bacteriologie (BPD)
Date: 26-04-2021
Documentation: https://github.com/RIVM-bioinformatics/Juno-seroba
Snakemake rules (in order of execution):
1 fastQC # Asses quality of raw reads.
2 trimmomatic # Trim low quality reads and adapter sequences.
3 fastQC # Asses quality of trimmed reads.
4 seroba # Serotyping Streptococcus pneumoniae
"""
#################################################################################
##### Import config file, sample_sheet and set output folder names #####
#################################################################################
import pathlib
import pprint
import os
import yaml
import json
#################################################################################
##### Load samplesheet, load genus dict and define output directory #####
#################################################################################
# SAMPLES is a dict with sample in the form sample > read number > file. E.g.: SAMPLES["sample_1"]["R1"] = "x_R1.gz"
SAMPLES = {}
with open(config["sample_sheet"]) as sample_sheet_file:
SAMPLES = yaml.safe_load(sample_sheet_file)
# OUT defines output directory for most rules.
OUT = config["out"]
#@#############################################################################
#@#### Processes #####
#@#############################################################################
###########################################################################
##### Data quality control and cleaning #####
###########################################################################
include: "bin/rules/fastqc_raw_data.smk"
include: "bin/rules/trimmomatic.smk"
include: "bin/rules/fastqc_clean_data.smk"
include: "bin/rules/multiqc.smk"
###########################################################################
##### Seroba serotyping #####
###########################################################################
include: "bin/rules/seroba.smk"
#@#############################################################################
#@#### Loggings before/after pipeline #####
#@#############################################################################
onstart:
try:
print("Checking if all specified files are accessible...")
important_files = [ config["sample_sheet"],
'files/trimmomatic_0.36_adapters_lists/NexteraPE-PE.fa' ]
for filename in important_files:
if not os.path.exists(filename):
raise FileNotFoundError(filename)
except FileNotFoundError as e:
print("This file is not available or accessible: %s" % e)
sys.exit(1)
else:
print("\tAll specified files are present!")
shell("""
mkdir -p {OUT}
mkdir -p {OUT}/audit_trail
echo -e "\nLogging pipeline settings..."
echo -e "\tGenerating methodological hash (fingerprint)..."
#TODO add the proper link to the repo
#echo -e "This is the link to the code used for this analysis:\thttps://github.com/RIVM-bioinformatics/Juno-seroba/tree/$(git log -n 1 --pretty=format:"%H")" > '{OUT}/audit_trail/log_git.txt'
echo -e "This code with unique fingerprint $(git log -n1 --pretty=format:"%H") was committed by $(git log -n1 --pretty=format:"%an <%ae>") at $(git log -n1 --pretty=format:"%ad")" >> '{OUT}/audit_trail/log_git.txt'
echo -e "\tGenerating full software list of current conda environment..."
conda list > '{OUT}/audit_trail/log_conda.txt'
echo -e "\tGenerating config file log..."
rm -f '{OUT}/audit_trail/log_config.txt'
touch '{OUT}/audit_trail/log_config.txt'
for file in config/*.yaml
do
echo -e "\n==> Contents of file \"${{file}}\": <==" >> '{OUT}/audit_trail/log_config.txt'
cat ${{file}} >> '{OUT}/audit_trail/log_config.txt'
echo -e "\n\n" >> '{OUT}/audit_trail/log_config.txt'
done
""")
onerror:
shell("""
echo "Something went wrong while running this pipeline. Please check the error messages and log files"
""")
onsuccess:
shell("""
rm -rf temp.*
echo -e "\tGenerating Snakemake report..."
snakemake --config out={OUT} sample_sheet="config/sample_sheet.yaml" min_cov=20 seroba_db=/mnt/db/seroba_db/database kmer_size=71 --configfile "config/pipeline_parameters.yaml" --cores 1 --unlock
snakemake --config out={OUT} sample_sheet="config/sample_sheet.yaml" min_cov=20 seroba_db=/mnt/db/seroba_db/database kmer_size=71 --configfile "config/pipeline_parameters.yaml" --cores 1 --report '{OUT}/audit_trail/snakemake_report.html'
if [ $? -eq 0 ]; then
echo -e "Pipeline finished successfully!"
else
echo "The Snakemake report could not be generated."
fi
""")
###############################################################################
##### Specify final output: #####
###############################################################################
localrules:
all
rule all:
input:
expand(OUT + "/qc_raw_fastq/{sample}_{read}_fastqc.zip", sample = SAMPLES, read = ['R1', 'R2']),
expand(OUT + "/clean_fastq/{sample}_{read}.fastq.gz", sample = SAMPLES, read = ['pR1', 'pR2', 'uR1', 'uR2']),
expand(OUT + "/qc_clean_fastq/{sample}_{read}_fastqc.zip", sample = SAMPLES, read = ['pR1', 'pR2']),
expand(OUT + "/serotype/{sample}/pred.tsv", sample = SAMPLES),
OUT + "/audit_trail/seroba_db.yaml",
OUT + "/multiqc/multiqc.html"