Skip to content

Commit

Permalink
output more information to JSON
Browse files Browse the repository at this point in the history
To support JSON based plotting
  • Loading branch information
sfchen committed Jul 21, 2017
1 parent 2040953 commit 58d7f87
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 28 deletions.
2 changes: 1 addition & 1 deletion after.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import copy
from util import *

AFTERQC_VERSION = "0.9.3"
AFTERQC_VERSION = "0.9.5"

def parseCommand():
usage = "Automatic Filtering, Trimming, Error Removing and Quality Control for Illumina fastq data \n\nSimplest usage:\ncd to the folder containing your fastq data, run <python after.py>"
Expand Down
74 changes: 55 additions & 19 deletions preprocesser.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,38 +688,74 @@ def run(self):
reporter.addFigure('Good reads and bad reads after filtering', self.r1qc_prefilter.statPlotly(labels, counts, TOTAL_READS, 'filter_stat'), 'filter_stat', "")
#self.r1qc_prefilter.plotFilterStats(labels, counts, colors, TOTAL_READS, os.path.join(qc_dir, "filter-stat.png"))

#squeeze qc data for JSON output
self.r1qc_prefilter.squeeze()
self.r1qc_postfilter.squeeze()
if self.options.read2_file != None:
self.r2qc_prefilter.squeeze()
self.r2qc_postfilter.squeeze()

stat={}
# stat["options"]=self.options
stat["summary"]=result
stat["afterqc_main_summary"]=result
stat["command"]=makeDict(self.options)
stat["kmer_content"] = {}
stat["kmer_content"]["read1_prefilter"] = self.r1qc_prefilter.topKmerCount[0:10]
stat["kmer_content"]["read1_postfilter"] = self.r1qc_postfilter.topKmerCount[0:10]

# output more data in JSON file for offline plotting directly from JSON
stat["base_quality"] = {}
stat["base_quality"]["read1_prefilter"] = self.r1qc_prefilter.baseMeanQual
stat["base_quality"]["read1_postfilter"] = self.r1qc_postfilter.baseMeanQual
stat["mean_quality"] = {}
stat["mean_quality"]["read1_prefilter"] = self.r1qc_prefilter.meanQual
stat["mean_quality"]["read1_postfilter"] = self.r1qc_postfilter.meanQual
stat["base_content"] = {}
stat["base_content"]["read1_prefilter"] = self.r1qc_prefilter.percents
stat["base_content"]["read1_postfilter"] = self.r1qc_postfilter.percents
stat["gc_content"] = {}
stat["gc_content"]["read1_prefilter"] = self.r1qc_prefilter.gcPercents
stat["gc_content"]["read1_postfilter"] = self.r1qc_postfilter.gcPercents

if self.options.read2_file != None:
stat["kmer_content"]["read2_prefilter"] = self.r2qc_prefilter.topKmerCount[0:10]
stat["kmer_content"]["read2_postfilter"] = self.r2qc_postfilter.topKmerCount[0:10]
stat["overlap"]={}
stat["overlap"]['overlapped_pairs']=OVERLAPPED


stat["base_quality"]["read2_prefilter"] = self.r2qc_prefilter.baseMeanQual
stat["base_quality"]["read2_postfilter"] = self.r2qc_postfilter.baseMeanQual

stat["mean_quality"]["read2_prefilter"] = self.r2qc_prefilter.meanQual
stat["mean_quality"]["read2_postfilter"] = self.r2qc_postfilter.meanQual

stat["base_content"]["read2_prefilter"] = self.r2qc_prefilter.percents
stat["base_content"]["read2_postfilter"] = self.r2qc_postfilter.percents

stat["gc_content"]["read2_prefilter"] = self.r2qc_prefilter.gcPercents
stat["gc_content"]["read2_postfilter"] = self.r2qc_postfilter.gcPercents

stat["afterqc_overlap"]={}
stat["afterqc_overlap"]['overlapped_pairs']=OVERLAPPED
if OVERLAPPED > 0:
stat["overlap"]['average_overlap_length']=float(OVERLAP_LEN_SUM/OVERLAPPED)
stat["afterqc_overlap"]['average_overlap_length']=float(OVERLAP_LEN_SUM/OVERLAPPED)
else:
stat["overlap"]['average_overlap_length']=0.0
stat["overlap"]['bad_mismatch_reads']=BADMISMATCH
stat["overlap"]['bad_diff']=BADDIFF
stat["overlap"]['bad_indel_reads']=BADINDEL
stat["overlap"]['corrected_reads']=READ_CORRECTED
stat["overlap"]['corrected_bases']=BASE_CORRECTED
stat["overlap"]['skipped_correction_bases']=BASE_SKIPPED_CORRECTION
stat["overlap"]['zero_qual_masked']=BASE_ZERO_QUAL_MASKED
stat["overlap"]['zero_qual_skipped']=BASE_ZERO_QUAL_MASKED
stat["overlap"]['trimmed_adapter_bases']=TRIMMED_ADAPTER_BASE
stat["overlap"]['trimmed_adapter_reads']=TRIMMED_ADAPTER_READ
stat["afterqc_overlap"]['average_overlap_length']=0.0
stat["afterqc_overlap"]['bad_mismatch_reads']=BADMISMATCH
stat["afterqc_overlap"]['bad_diff']=BADDIFF
stat["afterqc_overlap"]['bad_indel_reads']=BADINDEL
stat["afterqc_overlap"]['corrected_reads']=READ_CORRECTED
stat["afterqc_overlap"]['corrected_bases']=BASE_CORRECTED
stat["afterqc_overlap"]['skipped_correction_bases']=BASE_SKIPPED_CORRECTION
stat["afterqc_overlap"]['zero_qual_masked']=BASE_ZERO_QUAL_MASKED
stat["afterqc_overlap"]['zero_qual_skipped']=BASE_ZERO_QUAL_MASKED
stat["afterqc_overlap"]['trimmed_adapter_bases']=TRIMMED_ADAPTER_BASE
stat["afterqc_overlap"]['trimmed_adapter_reads']=TRIMMED_ADAPTER_READ
if OVERLAP_BASE_SUM > 0:
stat["overlap"]['error_rate']=float(OVERLAP_BASE_ERR)/float(OVERLAP_BASE_SUM)
stat["afterqc_overlap"]['error_rate']=float(OVERLAP_BASE_ERR)/float(OVERLAP_BASE_SUM)
else:
stat["overlap"]['error_rate']=0.0
stat["overlap"]['error_matrix']=OVERLAP_ERR_MATRIX
stat["overlap"]['edit_distance_histogram']=distance_histgram[0:10]
stat["afterqc_overlap"]['error_rate']=0.0
stat["afterqc_overlap"]['error_matrix']=OVERLAP_ERR_MATRIX
stat["afterqc_overlap"]['edit_distance_histogram']=distance_histgram[0:10]
reporter.addFigure('Sequence error distribution', self.r1qc_prefilter.errorPlotly(OVERLAP_ERR_MATRIX, 'error_matrix'), 'error_matrix', "")
reporter.addFigure('Overlap length distribution', self.r1qc_prefilter.overlapPlotly(overlap_histgram, readLen, TOTAL_READS, 'overlap_stat'), 'overlap_stat', "")
#self.r1qc_prefilter.plotOverlapHistgram(overlap_histgram, readLen, TOTAL_READS, os.path.join(qc_dir, "overlap.png"))
Expand Down
16 changes: 8 additions & 8 deletions qcreporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def outputRow(self, io, k, v):
io.write("<tr><td class='col1'>" + str(k) + "</td><td class='col2'>" + str(v) + "</td></tr>\n")

def getSequencing(self):
ret = str(self.stat["summary"]["readlen"])
ret = str(self.stat["afterqc_main_summary"]["readlen"])
if self.stat["command"]["read2_file"] != None:
ret = "2*" + ret + " pair end"
else:
Expand All @@ -91,14 +91,14 @@ def outputSummary(self, io):
io.write("<table class='summary-table'>\n")
self.outputRow(io, "AfterQC Version:", self.version)
self.outputRow(io, "sequencing:", self.getSequencing())
self.outputRow(io, "total reads:", self.stat["summary"]["total_reads"])
self.outputRow(io, "filtered out reads:", str(self.stat["summary"]["bad_reads"]) + " <font color='#aaaaaa'>(" + str(100.0 * float(self.stat["summary"]["bad_reads"])/float(self.stat["summary"]["total_reads"])) + "%)</font>")
self.outputRow(io, "total bases:", self.stat["summary"]["total_bases"])
self.outputRow(io, "filtered out bases:", str(self.stat["summary"]["total_bases"] - self.stat["summary"]["good_bases"]) + " <font color='#aaaaaa'>(" + str(100.0 * float(self.stat["summary"]["total_bases"] - self.stat["summary"]["good_bases"])/float(self.stat["summary"]["total_bases"])) + "%)</font>")
self.outputRow(io, "total reads:", self.stat["afterqc_main_summary"]["total_reads"])
self.outputRow(io, "filtered out reads:", str(self.stat["afterqc_main_summary"]["bad_reads"]) + " <font color='#aaaaaa'>(" + str(100.0 * float(self.stat["afterqc_main_summary"]["bad_reads"])/float(self.stat["afterqc_main_summary"]["total_reads"])) + "%)</font>")
self.outputRow(io, "total bases:", self.stat["afterqc_main_summary"]["total_bases"])
self.outputRow(io, "filtered out bases:", str(self.stat["afterqc_main_summary"]["total_bases"] - self.stat["afterqc_main_summary"]["good_bases"]) + " <font color='#aaaaaa'>(" + str(100.0 * float(self.stat["afterqc_main_summary"]["total_bases"] - self.stat["afterqc_main_summary"]["good_bases"])/float(self.stat["afterqc_main_summary"]["total_bases"])) + "%)</font>")
if self.stat["command"]["read2_file"] != None:
self.outputRow(io, "estimated seq error:", str(self.stat["overlap"]["error_rate"]*100) + "%")
self.outputRow(io, "adapter trimmed reads:", self.stat["overlap"]['trimmed_adapter_reads'])
self.outputRow(io, "adapter trimmed bases:", self.stat["overlap"]['trimmed_adapter_bases'])
self.outputRow(io, "estimated seq error:", str(self.stat["afterqc_overlap"]["error_rate"]*100) + "%")
self.outputRow(io, "adapter trimmed reads:", self.stat["afterqc_overlap"]['trimmed_adapter_reads'])
self.outputRow(io, "adapter trimmed bases:", self.stat["afterqc_overlap"]['trimmed_adapter_bases'])
self.outputRow(io, "auto trimming", "front:" + str(self.stat["command"]["trim_front"]) + ", tail:" + str(self.stat["command"]["trim_tail"]) + " (use <font color='#aaaaaa'>-f0 -t0</font> to disable)")
io.write("</table>\n")
io.write("</div>\n")
Expand Down
14 changes: 14 additions & 0 deletions qualitycontrol.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,20 @@ def __init__(self, qc_sample=1000000, qc_kmer=8):
self.baseMeanQual[base] = [0.0 for x in xrange(MAX_LEN)]
self.baseTotalQual[base] = [0 for x in xrange(MAX_LEN)]

def squeeze(self):
self.totalQual = self.totalQual[0:self.readLen]
self.totalNum = self.totalNum[0:self.readLen]
self.meanQual = self.meanQual[0:self.readLen]
self.gcPercents = self.gcPercents[0:self.readLen]
self.gcHistogram = self.gcHistogram[0:self.readLen]
self.meanDiscontinuity = self.meanDiscontinuity[0:self.readLen]
self.totalDiscontinuity = self.totalDiscontinuity[0:self.readLen]
for base in ALL_BASES:
self.baseCounts[base] = self.baseCounts[base][0:self.readLen]
self.percents[base] = self.percents[base][0:self.readLen]
self.baseMeanQual[base] = self.baseMeanQual[base][0:self.readLen]
self.baseTotalQual[base] = self.baseTotalQual[base][0:self.readLen]

def statRead(self, read):
global WARNED_BZIP2_ERROR
seq = read[1]
Expand Down

0 comments on commit 58d7f87

Please sign in to comment.