diff --git a/after.py b/after.py index aa47781..1716be5 100755 --- a/after.py +++ b/after.py @@ -9,7 +9,7 @@ import copy from util import * -AFTERQC_VERSION = "0.9.3" +AFTERQC_VERSION = "0.9.5" def parseCommand(): usage = "Automatic Filtering, Trimming, Error Removing and Quality Control for Illumina fastq data \n\nSimplest usage:\ncd to the folder containing your fastq data, run " diff --git a/preprocesser.py b/preprocesser.py index e094e3c..7128da3 100755 --- a/preprocesser.py +++ b/preprocesser.py @@ -688,38 +688,74 @@ def run(self): reporter.addFigure('Good reads and bad reads after filtering', self.r1qc_prefilter.statPlotly(labels, counts, TOTAL_READS, 'filter_stat'), 'filter_stat', "") #self.r1qc_prefilter.plotFilterStats(labels, counts, colors, TOTAL_READS, os.path.join(qc_dir, "filter-stat.png")) + #squeeze qc data for JSON output + self.r1qc_prefilter.squeeze() + self.r1qc_postfilter.squeeze() + if self.options.read2_file != None: + self.r2qc_prefilter.squeeze() + self.r2qc_postfilter.squeeze() + stat={} # stat["options"]=self.options - stat["summary"]=result + stat["afterqc_main_summary"]=result stat["command"]=makeDict(self.options) stat["kmer_content"] = {} stat["kmer_content"]["read1_prefilter"] = self.r1qc_prefilter.topKmerCount[0:10] stat["kmer_content"]["read1_postfilter"] = self.r1qc_postfilter.topKmerCount[0:10] + + # output more data in JSON file for offline plotting directly from JSON + stat["base_quality"] = {} + stat["base_quality"]["read1_prefilter"] = self.r1qc_prefilter.baseMeanQual + stat["base_quality"]["read1_postfilter"] = self.r1qc_postfilter.baseMeanQual + stat["mean_quality"] = {} + stat["mean_quality"]["read1_prefilter"] = self.r1qc_prefilter.meanQual + stat["mean_quality"]["read1_postfilter"] = self.r1qc_postfilter.meanQual + stat["base_content"] = {} + stat["base_content"]["read1_prefilter"] = self.r1qc_prefilter.percents + stat["base_content"]["read1_postfilter"] = self.r1qc_postfilter.percents + stat["gc_content"] = {} + stat["gc_content"]["read1_prefilter"] = self.r1qc_prefilter.gcPercents + stat["gc_content"]["read1_postfilter"] = self.r1qc_postfilter.gcPercents + if self.options.read2_file != None: stat["kmer_content"]["read2_prefilter"] = self.r2qc_prefilter.topKmerCount[0:10] stat["kmer_content"]["read2_postfilter"] = self.r2qc_postfilter.topKmerCount[0:10] - stat["overlap"]={} - stat["overlap"]['overlapped_pairs']=OVERLAPPED + + + stat["base_quality"]["read2_prefilter"] = self.r2qc_prefilter.baseMeanQual + stat["base_quality"]["read2_postfilter"] = self.r2qc_postfilter.baseMeanQual + + stat["mean_quality"]["read2_prefilter"] = self.r2qc_prefilter.meanQual + stat["mean_quality"]["read2_postfilter"] = self.r2qc_postfilter.meanQual + + stat["base_content"]["read2_prefilter"] = self.r2qc_prefilter.percents + stat["base_content"]["read2_postfilter"] = self.r2qc_postfilter.percents + + stat["gc_content"]["read2_prefilter"] = self.r2qc_prefilter.gcPercents + stat["gc_content"]["read2_postfilter"] = self.r2qc_postfilter.gcPercents + + stat["afterqc_overlap"]={} + stat["afterqc_overlap"]['overlapped_pairs']=OVERLAPPED if OVERLAPPED > 0: - stat["overlap"]['average_overlap_length']=float(OVERLAP_LEN_SUM/OVERLAPPED) + stat["afterqc_overlap"]['average_overlap_length']=float(OVERLAP_LEN_SUM/OVERLAPPED) else: - stat["overlap"]['average_overlap_length']=0.0 - stat["overlap"]['bad_mismatch_reads']=BADMISMATCH - stat["overlap"]['bad_diff']=BADDIFF - stat["overlap"]['bad_indel_reads']=BADINDEL - stat["overlap"]['corrected_reads']=READ_CORRECTED - stat["overlap"]['corrected_bases']=BASE_CORRECTED - stat["overlap"]['skipped_correction_bases']=BASE_SKIPPED_CORRECTION - stat["overlap"]['zero_qual_masked']=BASE_ZERO_QUAL_MASKED - stat["overlap"]['zero_qual_skipped']=BASE_ZERO_QUAL_MASKED - stat["overlap"]['trimmed_adapter_bases']=TRIMMED_ADAPTER_BASE - stat["overlap"]['trimmed_adapter_reads']=TRIMMED_ADAPTER_READ + stat["afterqc_overlap"]['average_overlap_length']=0.0 + stat["afterqc_overlap"]['bad_mismatch_reads']=BADMISMATCH + stat["afterqc_overlap"]['bad_diff']=BADDIFF + stat["afterqc_overlap"]['bad_indel_reads']=BADINDEL + stat["afterqc_overlap"]['corrected_reads']=READ_CORRECTED + stat["afterqc_overlap"]['corrected_bases']=BASE_CORRECTED + stat["afterqc_overlap"]['skipped_correction_bases']=BASE_SKIPPED_CORRECTION + stat["afterqc_overlap"]['zero_qual_masked']=BASE_ZERO_QUAL_MASKED + stat["afterqc_overlap"]['zero_qual_skipped']=BASE_ZERO_QUAL_MASKED + stat["afterqc_overlap"]['trimmed_adapter_bases']=TRIMMED_ADAPTER_BASE + stat["afterqc_overlap"]['trimmed_adapter_reads']=TRIMMED_ADAPTER_READ if OVERLAP_BASE_SUM > 0: - stat["overlap"]['error_rate']=float(OVERLAP_BASE_ERR)/float(OVERLAP_BASE_SUM) + stat["afterqc_overlap"]['error_rate']=float(OVERLAP_BASE_ERR)/float(OVERLAP_BASE_SUM) else: - stat["overlap"]['error_rate']=0.0 - stat["overlap"]['error_matrix']=OVERLAP_ERR_MATRIX - stat["overlap"]['edit_distance_histogram']=distance_histgram[0:10] + stat["afterqc_overlap"]['error_rate']=0.0 + stat["afterqc_overlap"]['error_matrix']=OVERLAP_ERR_MATRIX + stat["afterqc_overlap"]['edit_distance_histogram']=distance_histgram[0:10] reporter.addFigure('Sequence error distribution', self.r1qc_prefilter.errorPlotly(OVERLAP_ERR_MATRIX, 'error_matrix'), 'error_matrix', "") reporter.addFigure('Overlap length distribution', self.r1qc_prefilter.overlapPlotly(overlap_histgram, readLen, TOTAL_READS, 'overlap_stat'), 'overlap_stat', "") #self.r1qc_prefilter.plotOverlapHistgram(overlap_histgram, readLen, TOTAL_READS, os.path.join(qc_dir, "overlap.png")) diff --git a/qcreporter.py b/qcreporter.py index 0a73c3d..9907782 100644 --- a/qcreporter.py +++ b/qcreporter.py @@ -78,7 +78,7 @@ def outputRow(self, io, k, v): io.write("" + str(k) + "" + str(v) + "\n") def getSequencing(self): - ret = str(self.stat["summary"]["readlen"]) + ret = str(self.stat["afterqc_main_summary"]["readlen"]) if self.stat["command"]["read2_file"] != None: ret = "2*" + ret + " pair end" else: @@ -91,14 +91,14 @@ def outputSummary(self, io): io.write("\n") self.outputRow(io, "AfterQC Version:", self.version) self.outputRow(io, "sequencing:", self.getSequencing()) - self.outputRow(io, "total reads:", self.stat["summary"]["total_reads"]) - self.outputRow(io, "filtered out reads:", str(self.stat["summary"]["bad_reads"]) + " (" + str(100.0 * float(self.stat["summary"]["bad_reads"])/float(self.stat["summary"]["total_reads"])) + "%)") - self.outputRow(io, "total bases:", self.stat["summary"]["total_bases"]) - self.outputRow(io, "filtered out bases:", str(self.stat["summary"]["total_bases"] - self.stat["summary"]["good_bases"]) + " (" + str(100.0 * float(self.stat["summary"]["total_bases"] - self.stat["summary"]["good_bases"])/float(self.stat["summary"]["total_bases"])) + "%)") + self.outputRow(io, "total reads:", self.stat["afterqc_main_summary"]["total_reads"]) + self.outputRow(io, "filtered out reads:", str(self.stat["afterqc_main_summary"]["bad_reads"]) + " (" + str(100.0 * float(self.stat["afterqc_main_summary"]["bad_reads"])/float(self.stat["afterqc_main_summary"]["total_reads"])) + "%)") + self.outputRow(io, "total bases:", self.stat["afterqc_main_summary"]["total_bases"]) + self.outputRow(io, "filtered out bases:", str(self.stat["afterqc_main_summary"]["total_bases"] - self.stat["afterqc_main_summary"]["good_bases"]) + " (" + str(100.0 * float(self.stat["afterqc_main_summary"]["total_bases"] - self.stat["afterqc_main_summary"]["good_bases"])/float(self.stat["afterqc_main_summary"]["total_bases"])) + "%)") if self.stat["command"]["read2_file"] != None: - self.outputRow(io, "estimated seq error:", str(self.stat["overlap"]["error_rate"]*100) + "%") - self.outputRow(io, "adapter trimmed reads:", self.stat["overlap"]['trimmed_adapter_reads']) - self.outputRow(io, "adapter trimmed bases:", self.stat["overlap"]['trimmed_adapter_bases']) + self.outputRow(io, "estimated seq error:", str(self.stat["afterqc_overlap"]["error_rate"]*100) + "%") + self.outputRow(io, "adapter trimmed reads:", self.stat["afterqc_overlap"]['trimmed_adapter_reads']) + self.outputRow(io, "adapter trimmed bases:", self.stat["afterqc_overlap"]['trimmed_adapter_bases']) self.outputRow(io, "auto trimming", "front:" + str(self.stat["command"]["trim_front"]) + ", tail:" + str(self.stat["command"]["trim_tail"]) + " (use -f0 -t0 to disable)") io.write("
\n") io.write("\n") diff --git a/qualitycontrol.py b/qualitycontrol.py index 2c5e800..d38f2ad 100644 --- a/qualitycontrol.py +++ b/qualitycontrol.py @@ -56,6 +56,20 @@ def __init__(self, qc_sample=1000000, qc_kmer=8): self.baseMeanQual[base] = [0.0 for x in xrange(MAX_LEN)] self.baseTotalQual[base] = [0 for x in xrange(MAX_LEN)] + def squeeze(self): + self.totalQual = self.totalQual[0:self.readLen] + self.totalNum = self.totalNum[0:self.readLen] + self.meanQual = self.meanQual[0:self.readLen] + self.gcPercents = self.gcPercents[0:self.readLen] + self.gcHistogram = self.gcHistogram[0:self.readLen] + self.meanDiscontinuity = self.meanDiscontinuity[0:self.readLen] + self.totalDiscontinuity = self.totalDiscontinuity[0:self.readLen] + for base in ALL_BASES: + self.baseCounts[base] = self.baseCounts[base][0:self.readLen] + self.percents[base] = self.percents[base][0:self.readLen] + self.baseMeanQual[base] = self.baseMeanQual[base][0:self.readLen] + self.baseTotalQual[base] = self.baseTotalQual[base][0:self.readLen] + def statRead(self, read): global WARNED_BZIP2_ERROR seq = read[1]