-
Notifications
You must be signed in to change notification settings - Fork 0
/
dramv_amgs_table.py
executable file
·67 lines (52 loc) · 2.98 KB
/
dramv_amgs_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python3
"""
Requires python >= 3.9 and pandas
"""
import argparse
import pandas as pd
import sys
import os.path
parser = argparse.ArgumentParser()
default_column = 'pfam_hits'
parser.add_argument("pipelinedir", help="directory containing pipeline outputs - including sample directories")
parser.add_argument("samplelist", help="file containing list of samples - should be names of sample directories that you want to include in the summary table - one per line")
parser.add_argument("-d", "--dramv", help="which dramv directory do you want to use the annotations from. e.g. dramv or amgs (default: %(default)s)", default="amgs")
parser.add_argument("-r", "--verse", help="which verse directory do you want to use the abundances from. (default: %(default)s)", default="amgs/abund_amgs")
parser.add_argument("-v", "--value", help="what value from abundance file to fill in the table (default: %(default)s)", choices = ['count', 'cpm', 'rpk'], default = 'count')
parser.add_argument("-s", "--suffix", help="suffix for file with abundances (default: %(default)s)", default = "_amgs.count.gene.cpm.txt")
parser.add_argument("--debug", help="print debug messages", action='store_true')
args = parser.parse_args()
if args.debug: print(args, file=sys.stderr)
def read_abund(abundfile, amgtable, value, sample):
"""abundfile: abundance filename - output of verse
annotable: corresponding dramv annotations table from read_annotate
value: which value column from abundfile to include
sample: sample name associated with file
"""
a = pd.read_csv(abundfile, sep='\t')
a = a.merge(amgtable, how='inner', left_on = 'gene', right_on = 'gene')
a = a[list(amgtable.columns) + [value]]
return(a)
def read_amg(amgfile, sample):
a = pd.read_csv(amgfile, sep='\t')
## get just the relevant columns
column = ['gene', 'gene_id', 'gene_description']
a = a[column]
## add sample column
a['sample'] = sample
return(a)
## read in samples
with open(args.samplelist, 'r') as f:
samples = f.read().splitlines()
amgdf = read_amg(os.path.join(args.pipelinedir, samples[0], args.dramv, "dramv-distill", "amg_summary.tsv"), samples[0])
## make list of sample abundance files and amg summary files
abundlist = {sample: os.path.join(args.pipelinedir, sample, args.verse, sample + args.suffix) for sample in samples}
amglist = {sample: os.path.join(args.pipelinedir, sample, args.dramv, "dramv-distill", "amg_summary.tsv") for sample in samples}
## read in amg and abund files and make individual sample tables
amgtabs = {sample: read_amg(amglist[sample], sample) for sample in samples}
abundtabs = [read_abund(abundlist[sample], amgtabs[sample], args.value, sample) for sample in samples]
## concatenate abundance tables
longtab = pd.concat(abundtabs)
## pivot/cast table into wide gene_id table
finaltab = pd.pivot_table(longtab, values = args.value, index = ['gene_id', 'gene_description'], columns = 'sample', fill_value=0, aggfunc='sum')
finaltab.to_csv(sys.stdout, sep = "\t")