-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_fastaq_pipe.py
71 lines (54 loc) · 2.17 KB
/
merge_fastaq_pipe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# simple merger for 2 fastq files
# written for python 2.7
#
# please make sure input files are sorted
# and have the same length
# and same file format
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import argparse
import gzip
import sys
parser = argparse.ArgumentParser("merges 2 fastq files into one by concatenating the sequences and qualitysequences\n make sure both files have the same format!")
parser.add_argument("fileA", type=str, help="fastq file A")
parser.add_argument("fileB", type=str, help="fastq file B")
parser.add_argument("--out", default=None, help="name outputfile - otherwise output to std-out. If file name ends with .gz file will be compressed automatically.")
#parser.add_argument("--gzip-input-files", default=False, action="store_true", help="set flag if files are compressed")
parser.add_argument("--format", default="fastq", type=str, help="specify your fastq format - default is fastq")
args = parser.parse_args()
fq_format = args.format
# read files
try:
if args.fileA.endswith(".gz"): #automatic gz scanning args.gzip_input_files:
handleA = gzip.open(args.fileA,"rt")
handleB = gzip.open(args.fileB,"rt")
fileA = SeqIO.parse(handleA,fq_format)
fileB = SeqIO.parse(handleB,fq_format)
else:
fileA = SeqIO.parse(args.fileA,fq_format)
fileB = SeqIO.parse(args.fileB,fq_format)
except Exception as e:
print("something went wrong reading files \n"+e.message)
sys.exit(0)
# merge and output
if args.out is None:
output = sys.stdout
elif args.out.endswith(".gz"):
output = gzip.open(args.out,"wt")
else:
output = open(args.out,"w")
while True:
try:
recA = next(fileA)
recB = next(fileB)
if recA.id != recB.id:
sys.stderr.write("found non matching ids, skipping, make sure files are sorted\n")
continue
out_record = SeqRecord(recA.seq+recB.seq, id = recA.id, name = recA.name, description = recA.description+"|"+recB.description)
out_record.letter_annotations["phred_quality"] = recA.letter_annotations["phred_quality"]+recB.letter_annotations["phred_quality"]
output.write(out_record.format(fq_format))
except StopIteration as e:
break
except Exception as e:
sys.stderr.write("Something went wrong.\n"+e.message)
sys.exit()