Skip to content

Commit

Permalink
Merge pull request #42 from aofarrel/qc_csv
Browse files Browse the repository at this point in the history
Add QC CSV final output, fix weird or incorrect QC filters
  • Loading branch information
aofarrel authored Nov 9, 2023
2 parents 86b174a + 64dfd37 commit b1e63eb
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 40 deletions.
10 changes: 10 additions & 0 deletions doc/style_guide.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Style Guide (draft)

## Percentages/Ratios
User facing inputs and outputs that are a percentage are written as an integer or float between 0 and 100. Something that is 90% will be written as 90 or as 90.0, not 0.90
✔️ Int covstatsQC_max_percent_unmapped = 2
X Int covstatsQC_max_percent_unmapped = 0.2

## Units for disk size
Unless otherwise specified, user-facing storage units are in gigabytes. For example, variantcalling_addl_disk being 100 is interpreted as "add 100 GB to the disk calculation."

30 changes: 30 additions & 0 deletions lims_qc_csv.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

## implemented
sample
Sample ID
genome_pct_coverage
Percentage of the genome covered by the sample (earlyQC)
mean_coverage
Mean coverage (covstats)
median_coverage
Median coverage (earlyQC, specifically TBProfiler)
pct_above_q30
Percent of reads above q30 (earlyQC, specifically fastp)
reads_is_contam
Reads considered contaminated (and removed) by the decontaminator
reads_reference
Reads considered mapping to reference by the decontaminator
reads_unmapped
Reads considered unmapped by the decontaminator
status
The status of this sample

## ideas
warnings
Non-fatal warnings for this sample
out_pct_pass_fastp
Percent of reads that pass fastp's filters
coverage_cutoff
Coverage below this is considered "low coverage" wrt in_pct_low_cov
in_pct_low_cov
Percent of sites (CHECK!!) determined with coverage below coverage_cutoff; these sites get masked when creating diff files
101 changes: 61 additions & 40 deletions myco_raw.wdl
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
version 1.0

import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.11.1/tasks/combined_decontamination.wdl" as clckwrk_combonation
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.11.0/tasks/variant_call_one_sample.wdl" as clckwrk_var_call
import "https://raw.githubusercontent.com/aofarrel/SRANWRP/v1.1.12/tasks/processing_tasks.wdl" as sranwrp_processing
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.11.3/tasks/combined_decontamination.wdl" as clckwrk_combonation
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.11.3/tasks/variant_call_one_sample.wdl" as clckwrk_var_call
import "https://raw.githubusercontent.com/aofarrel/SRANWRP/v1.1.17/tasks/processing_tasks.wdl" as sranwrp_processing
import "https://raw.githubusercontent.com/aofarrel/tree_nine/0.0.10/tree_nine.wdl" as build_treesWF
import "https://raw.githubusercontent.com/aofarrel/parsevcf/1.2.0/vcf_to_diff.wdl" as diff
import "https://raw.githubusercontent.com/aofarrel/tb_profiler/0.2.2/tbprofiler_tasks.wdl" as profiler
import "https://raw.githubusercontent.com/aofarrel/TBfastProfiler/0.0.10/neoTBfastProfiler.wdl" as qc_fastqsWF # aka earlyQC
import "https://raw.githubusercontent.com/aofarrel/TBfastProfiler/0.0.11/neoTBfastProfiler.wdl" as qc_fastqsWF # aka earlyQC
import "https://raw.githubusercontent.com/aofarrel/goleft-wdl/0.1.2/goleft_functions.wdl" as goleft

workflow myco {
input {
Array[Array[File]] paired_fastq_sets

Int covstatsQC_minimum_coverage = 10
Int covstatsQC_max_percent_unmapped= 2
Int covstatsQC_min_coverage = 10
Int covstatsQC_min_pct_unmapped = 2
Boolean covstatsQC_skip_entirely = false
Boolean decontam_use_CDC_varpipe_ref = true
File? diffQC_mask_bedfile
Int diffQC_max_percent_low_coverage= 20
Int diffQC_low_coverage_cutoff = 10
Int earlyQC_minimum_percent_q30 = 90
Int diffQC_max_pct_low_coverage = 20
Int diffQC_this_is_low_coverage = 10
Int earlyQC_min_q30_rate = 90
Boolean earlyQC_skip_entirely = false
Boolean earlyQC_skip_QC = false
Boolean earlyQC_skip_trimming = false
Expand All @@ -29,7 +29,7 @@ workflow myco {
Int subsample_cutoff = -1
Int subsample_seed = 1965
Boolean tbprofiler_on_bam = false
Float tbprofilerQC_max_pct_unmapped = 2
Int tbprofilerQC_min_pct_mapped = 98
Int timeout_decontam_part1 = 0
Int timeout_decontam_part2 = 0
Int timeout_variant_caller = 0
Expand All @@ -48,23 +48,23 @@ workflow myco {
}

parameter_meta {
covstatsQC_minimum_coverage: "If covstats thinks MEAN coverage is below this, throw out this sample - not to be confused with TBProfiler MEDIAN coverage"
covstatsQC_max_percent_unmapped: "If covstats thinks this percent (as int, 50 = 50%) of data does not map to H37Rv, throw out this sample"
covstatsQC_min_coverage: "If covstats thinks MEAN coverage is below this, throw out this sample - not to be confused with TBProfiler MEDIAN coverage"
covstatsQC_min_pct_unmapped: "If covstats thinks less than this percent (as int, 50 = 50%) of data does not map to H37Rv, throw out this sample"
covstatsQC_skip_entirely: "Should we skip covstats entirely?"
diffQC_mask_bedfile: "Bed file of regions to mask when making diff files (default: R00000039_repregions.bed)"
diffQC_max_percent_low_coverage: "Samples who have more than this percent (as int, 50 = 50%) of positions with coverage below diffQC_low_coverage_cutoff will be discarded"
diffQC_low_coverage_cutoff: "Positions with coverage below this value will be masked in diff files"
earlyQC_minimum_percent_q30: "Decontaminated samples with less than this percent (as int, 50 = 50%) of reads above qual score of 30 will be discarded. Negated by earlyQC_skip_QC or earlyQC_skip_entirely being false."
diffQC_max_pct_low_coverage: "Samples who have more than this percent (as int, 50 = 50%) of positions with coverage below diffQC_this_is_low_coverage will be discarded"
diffQC_this_is_low_coverage: "Positions with coverage below this value will be masked in diff files"
earlyQC_min_q30_rate: "Decontaminated samples with less than this percent (as int, 50 = 50%) of reads above qual score of 30 will be discarded. Negated by earlyQC_skip_QC or earlyQC_skip_entirely being false."
earlyQC_skip_entirely: "Do not run earlyQC at all - no trimming, no QC, nothing."
earlyQC_skip_QC: "Run earlyQC (unless earlyQC_skip_entirely is true), but do not throw out samples that fail QC. Independent of earlyQC_skip_trimming."
earlyQC_skip_trimming: "Run earlyQC (unless earlyQC_skip_entirely is true), and remove samples that fail QC (unless earlyQC_skip_QC is true), but do not use fastp's cleaned fastqs."
earlyQC_trim_qual_below: "Trim reads with an average quality score below this value. Independent of earlyQC_minimum_percent_q30. Overridden by earlyQC_skip_trimming or earlyQC_skip_entirely being true."
earlyQC_trim_qual_below: "Trim reads with an average quality score below this value. Independent of earlyQC_min_q30_rate. Overridden by earlyQC_skip_trimming or earlyQC_skip_entirely being true."
quick_tasks_disk_size: "Disk size in GB to use for quick file-processing tasks; increasing this might slightly speed up file localization"
paired_fastq_sets: "Nested array of paired fastqs, each inner array representing one samples worth of paired fastqs"
subsample_cutoff: "If a fastq file is larger than than size in MB, subsample it with seqtk (set to -1 to disable)"
subsample_seed: "Seed used for subsampling with seqtk"
tbprofiler_on_bam: "If true, run TBProfiler on BAMs"
tbprofilerQC_max_pct_unmapped: "If tbprofiler thinks this percent (as float, 50 = 50%) of data does not map to H37Rv, throw out this sample"
tbprofilerQC_min_pct_mapped: "If tbprofiler thinks less than this percent (as int, 50 = 50%) of data does map to H37Rv, throw out this sample"
timeout_decontam_part1: "Discard any sample that is still running in clockwork map_reads after this many minutes (set to 0 to never timeout)"
timeout_decontam_part2: "Discard any sample that is still running in clockwork rm_contam after this many minutes (set to 0 to never timeout)"
timeout_variant_caller: "Discard any sample that is still running in clockwork variant_call_one_sample after this many minutes (set to 0 to never timeout)"
Expand All @@ -74,10 +74,9 @@ workflow myco {

String pass = "PASS" # used later... much later
# convert percent integers to floats (except covstatsQC_max_percent_unmapped)
Float diffQC_max_percent_low_coverage_float = diffQC_max_percent_low_coverage / 100.0
Float earlyQC_minimum_percent_q30_float = earlyQC_minimum_percent_q30 / 100.0
Float tbprofilerQC_max_percent_unmapped_float = tbprofilerQC_max_pct_unmapped / 100.0
# flip some QC stuff around
Float diffQC_max_pct_low_coverage_float = diffQC_max_pct_low_coverage / 100.0
Int covstatsQC_max_percent_unmapped = 100 - covstatsQC_min_pct_unmapped

scatter(paired_fastqs in paired_fastq_sets) {
call clckwrk_combonation.combined_decontamination_single_ref_included as decontam_each_sample {
Expand All @@ -96,17 +95,21 @@ workflow myco {
# select_first() where the first element is the File? we know must exist, and the second element is bogus.
File real_decontaminated_fastq_1=select_first([decontam_each_sample.decontaminated_fastq_1, paired_fastqs[0]])
File real_decontaminated_fastq_2=select_first([decontam_each_sample.decontaminated_fastq_2, paired_fastqs[0]])

if((decontam_each_sample.reads_kept < 5000)) {
String warning_decontam = "DECONTAMINATION_ONLY" + decontam_each_sample.reads_kept + "_READS_REMAINING_(MIN_" + 5000 + ")" #!StringCoercion
}

if(!earlyQC_skip_entirely) {
call qc_fastqsWF.TBfastProfiler as qc_fastqs {
input:
fastq1 = real_decontaminated_fastq_1,
fastq2 = real_decontaminated_fastq_2,
q30_cutoff = earlyQC_minimum_percent_q30_float,
minimum_q30_rate = earlyQC_min_q30_rate,
average_qual = earlyQC_trim_qual_below,
use_fastps_cleaned_fastqs = !(earlyQC_skip_trimming),
soft_all_qc = earlyQC_skip_QC,
pct_mapped_cutoff = tbprofilerQC_max_percent_unmapped_float
minimum_pct_mapped = tbprofilerQC_min_pct_mapped
}
# if this sample passes fastp, or if earlyQC_skip_QC is true...
if(qc_fastqs.status_code == pass) {
Expand Down Expand Up @@ -211,20 +214,19 @@ workflow myco {
}

if(covstats.percentUnmapped < covstatsQC_max_percent_unmapped) {
if(covstats.coverage > covstatsQC_minimum_coverage) {
if(covstats.coverage > covstatsQC_min_coverage) {

# make diff files
call diff.make_mask_and_diff as make_mask_and_diff_after_covstats {
input:
bam = vcfs_and_bams.left[0],
vcf = vcfs_and_bams.right,
min_coverage_per_site = diffQC_low_coverage_cutoff,
min_coverage_per_site = diffQC_this_is_low_coverage,
tbmf = diffQC_mask_bedfile,
max_ratio_low_coverage_sites_per_sample = diffQC_max_percent_low_coverage_float
max_ratio_low_coverage_sites_per_sample = diffQC_max_pct_low_coverage_float
}
}
}

}

if(covstatsQC_skip_entirely) {
Expand All @@ -234,9 +236,9 @@ workflow myco {
input:
bam = vcfs_and_bams.left[0],
vcf = vcfs_and_bams.right,
min_coverage_per_site = diffQC_low_coverage_cutoff,
min_coverage_per_site = diffQC_this_is_low_coverage,
tbmf = diffQC_mask_bedfile,
max_ratio_low_coverage_sites_per_sample = diffQC_max_percent_low_coverage_float
max_ratio_low_coverage_sites_per_sample = diffQC_max_pct_low_coverage_float
}
}

Expand Down Expand Up @@ -364,8 +366,8 @@ workflow myco {
# for an individual sample as workflow-level output, which gets written to the Terra data table.
# is there only one sample?
if(length(paired_fastq_sets) == 1) {
if(length(paired_fastq_sets) == 1) {

# did the decontamination step actually run? (note that defined() is not a robust check, but since this is the first task
# in the workflow this should be okay for now)
if(defined(decontam_each_sample.errorcode)) {
Expand Down Expand Up @@ -426,9 +428,9 @@ workflow myco {
Float meanCoverage = meanCoverages[0]

if(percentUnmapped > covstatsQC_max_percent_unmapped) { String too_many_unmapped = "COVSTATS_LOW_PCT_MAPPED_TO_REF"
if(meanCoverage < covstatsQC_minimum_coverage) { String double_bad = "COVSTATS_BAD_MAP_AND_COVERAGE" }
if(meanCoverage < covstatsQC_min_coverage) { String double_bad = "COVSTATS_BAD_MAP_AND_COVERAGE" }
}
if(meanCoverage < covstatsQC_minimum_coverage) { String too_low_coverage = "COVSTATS_LOW_MEAN_COVERAGE" }
if(meanCoverage < covstatsQC_min_coverage) { String too_low_coverage = "COVSTATS_LOW_MEAN_COVERAGE" }
}
#}
}
Expand All @@ -450,7 +452,28 @@ workflow myco {
# final-final-final error code
# earlyQC is at the end (but before PASS) to account for earlyQC_skip_QC = true
String finalcode = select_first([decontam_ERR, varcall_ERR, covstats_ERR, vcfdiff_ERR, earlyQC_ERR, pass])

}

# miniwdl check will allow using just one flatten() here, but womtool will not. per the spec, flatten() isn't recursive.
# TODO: this is still breaking in Cromwell!
# Failed to evaluate 'warnings' (reason 1 of 1): Evaluating flatten(flatten([[select_all(qc_fastqs.warning_codes)], [select_all(warning_decontam)]])) failed: No coercion defined from wom value(s) '[["EARLYQC_88.112_PCT_ABOVE_Q30_(MIN_0.9)", "EARLYQC_99.61_PCT_MAPPED_(MIN_99.995)"]]' of type 'Array[Array[String]]' to 'Array[String]'.
#Array[String] warnings = flatten(flatten([[select_all(qc_fastqs.warning_codes)], [select_all(warning_decontam)]]))
Map[String, String] metrics_to_values = {
"status": select_first([finalcode, "NA"]),
"reads_is_contam": select_first([decontam_each_sample.reads_is_contam[0], "NA"]), # decontamination
"reads_reference": select_first([decontam_each_sample.reads_reference[0], "NA"]), # decontamination
"reads_unmapped": select_first([decontam_each_sample.reads_unmapped[0], "NA"]), # decontamination
"pct_above_q30": select_first([qc_fastqs.pct_above_q30[0], "NA"]), # fastp
"median_coverage": select_first([qc_fastqs.median_coverage[0], "NA"]), # thiagen!TBProfiler
"genome_pct_coverage": select_first([qc_fastqs.pct_genome_covered[0], "NA"]), # thiagen!TBProfiler
"mean_coverage": select_first([meanCoverage, "NA"]) # covstats
}

call sranwrp_processing.map_to_tsv_or_csv as qc_summary {
input:
the_map = metrics_to_values,
column_names = if length(paired_fastq_sets) == 1 then [basename(paired_fastq_sets[0][0])] else ["sample"]
}

output {
Expand Down Expand Up @@ -499,7 +522,8 @@ workflow myco {
Array[File]? trees_nextstrain = trees.subtrees_nextstrain

# useful debugging/run information (only valid iff this ran on only one sample)
Array[String]? pass_or_warnings = qc_fastqs.warning_codes[0]
File qc_csv = qc_summary.tsv_or_csv
#Array[String] pass_or_warnings = if (length(warnings) > 0) then warnings else ["PASS"]
String? debug_decontam_ERR = decontam_ERR
String? debug_earlyQC_ERR = earlyQC_ERR
String? debug_varcall_ERR = varcall_ERR
Expand All @@ -508,11 +532,8 @@ workflow myco {
Array[String]? debug_vcfdiff_errorcode_if_covstats = vcfdiff_errorcode_if_covstats
Array[String]? debug_vcfdiff_errorcode_if_no_covstats = vcfdiff_errorcode_if_no_covstats
Array[String]? debug_vcfdiff_errorcode_array = vcfdiff_errorcode_array
Int seconds_to_untar = decontam_each_sample.seconds_to_untar[0]
Int seconds_to_map_reads = decontam_each_sample.seconds_to_map_reads[0]
Int seconds_to_sort = decontam_each_sample.seconds_to_sort[0]
Int seconds_to_rm_contam = decontam_each_sample.seconds_to_rm_contam[0]
Int seconds_total = decontam_each_sample.seconds_total[0]
Int seconds_to_map_reads = decontam_each_sample.timer_map_reads[0]
Int seconds_to_rm_contam = decontam_each_sample.timer_rm_contam[0]
String docker_used = decontam_each_sample.docker_used[0]
}
}
Expand Down

0 comments on commit b1e63eb

Please sign in to comment.