-
Notifications
You must be signed in to change notification settings - Fork 0
/
bw4posres.py
260 lines (226 loc) · 10.7 KB
/
bw4posres.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
#!/usr/bin/env python3.7
"""
================================================================================
File: bw4posres.py
Authors: Mauricio Esguerra
Date: June 23, 2015
Email: [email protected]
Description:
With this code we wish to do various task in one module:
1. Translate pdb to fasta without resorting to import Bio.
2. Align the translated fasta sequence to a Multiple Sequence Alignment (MSA)
and place Marks coming from a network of identified conserved
pair-distances of Venkatakrishnan et al.
clustalo --profile1=GPCR_inactive_BWtags.aln --profile2=mod1.fasta \
-o withbwtags.aln --outfmt=clustal --wrap=1000 --force -v -v -v
3. Translate Marks into properly identified residues in sequence. Notice that
this depends on a dictionary which uses the Ballesteros-Weinstein numbering.
4. From sequence ID. pull the atom-numbers of corresponding c-alphas
in the matched residues.
================================================================================
"""
import os
import subprocess
import settings as s
import logging
logger_bw4posres = logging.getLogger('pymemdyn.bw4posres')
# A larger dictionary of three to one letters can be used.
# For example the dictionary contained at Data/SCOPData.py in a Biopython
# distribution.
# Usually the path where biopython is installed will look something like
# the following path: /lib/python2.7/site-packages/Bio
# For now we are just extending the dictionary to the various possible cases
# for Histidines, that is, HIA, HIC, HID, HIE, HIP, HIQ
protein_letters_3to1 = {'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
'GLU': 'E', 'GLN': 'Q', 'GLY': 'G', 'HIA': 'H', 'HIC': 'H',
'HID': 'H', 'HIE': 'H', 'HIP': 'H', 'HIQ': 'H', 'HIS': 'H',
'ILE': 'I', 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F',
'PRO': 'P', 'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y',
'VAL': 'V'}
bwtags = ['1.46', '1.49', '1.50', '1.53', '1.57', '2.42', '2.43', '2.44', '2.47', '2.50',
'3.34', '3.36', '3.38', '3.40', '3.44', '3.46', '3.47', '3.51', '4.50', '4.53',
'4.57', '5.54', '5.57', '5.60', '6.41', '6.44', '6.47', '6.48', '6.51', '7.38',
'7.39', '7.45', '7.46', '7.47', '7.50', '7.53']
bwpairs = [('1.46', '7.47'), ('1.49', '7.50'), ('1.50', '2.47'), ('1.50', '2.50'),
('1.50', '7.46'), ('1.53', '2.47'), ('1.57', '2.44'), ('2.42', '3.46'),
('2.43', '7.53'), ('2.50', '7.46'), ('3.34', '4.53'), ('3.34', '4.57'),
('3.36', '6.48'), ('3.38', '4.50'), ('3.38', '4.53'), ('3.40', '6.44'),
('3.44', '5.54'), ('3.47', '5.57'), ('3.51', '5.57'), ('3.51', '5.60'),
('5.54', '6.41'), ('6.47', '7.45'), ('6.51', '7.38'), ('6.51', '7.39')]
class Run(object):
"""
A pdb file is given as input to convert into one letter sequence
and then align to curated multiple sequence alignment and then
assign Ballesteros-Weinstein numbering to special positions.
"""
def __init__(self, pdb, **kwargs):
"""
The init method is a kind of constructor, called when an instance
of the class is created. The method serves to initialize what you
want to do with the object.
"""
self.pdb = pdb
self.own_dir = kwargs.get("own_dir") or ""
self.clustal_bin = s.CLUSTAL_BIN
self.repo_dir = s.TEMPLATES_DIR
self.loggerbw = logging.getLogger('pymemdyn.bw4posres.Run')
def pdb2fas(self):
"""
From pdb file convert to fasta sequence format without the use of
dependencies such as BioPython. This pdb to fasta translator
checks for the existance of c-alpha residues and it is
based on their 3-letter sequence id.
"""
fastaseq = open(os.path.join(self.pdb.split(".")[0] + ".fasta"), "w")
self.loggerbw.debug('writing now: '+os.path.join(self.pdb.split(".")[0] + ".fasta"))
fastaseq.write(">")
fastaseq.write("{0}\n".format(self.pdb))
result = []
for line in open(self.pdb, "r"):
atoms = [a for a in line if line[0:6] == "ATOM " and line[13:16] == "CA "]
seqnam = ''.join(atoms[17:20])
if seqnam != '':
seq = protein_letters_3to1[seqnam]
result.append(seq)
lines = []
numcol = 70
resultasstr = ''.join(result)
for i in range(0, len(resultasstr), numcol):
lines.append(resultasstr[i:i+numcol])
fastaseq.write("{0}".format("\n".join(lines)))
fastaseq.write("\n")
fastaseq.close()
def clustalalign(self):
"""
Align the produced fasta sequence with clustalw to assign
Ballesteros-Weinstein marks.
"""
profile1 = self.repo_dir + "/GPCR_inactive_BWtags.aln"
profile2 = os.path.join(self.pdb.split(".")[0] + ".fasta")
bwtagged = os.path.join(self.pdb.split(".")[0] + "_bw" + ".aln")
command = [
self.clustal_bin,
"-profile1=" + profile1,
"-profile2=" + profile2,
"-outfile=" + bwtagged,
"-output=clustal"]
proc = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
proc.communicate()
return True
def getcalphas(self):
"""
Pulls out the atom numbers of c-alpha atoms. Restraints are
placed on c-alpha atoms.
"""
calphaspdb = open(os.path.join(self.pdb.split(".")[0] + "_CA.pdb"), "w")
for line in open(self.pdb, "r"):
atoms = [a for a in line if line[0:6] == "ATOM " and line[13:16] == "CA "]
calphaspdb.write("{0}".format("".join(atoms)))
def makedisre(self):
"""
Creates a disre.itp file with atom-pair id's to be restrained
using and NMR-style Heaviside function based on
Ballesteros-Weinstein tagging.
"""
disre = open('disre.itp', 'w')
readdisre = open(self.repo_dir + "/disre.itp", 'r')
disre.write("; This file provides distance restraints based on an online example\n")
disre.write("; by David Van der Spoel for alpha-helices.\n")
disre.write("; Notice that for now we are using a fixed value for up2 of 1.2.\n")
disre.write("; low is the average pair distance for the set of 19 inactive structures\n")
disre.write("; of Ramakrishnan et al. minus one sd, up1 is the average plus one sd.\n")
disre.write("; ai aj type index type' low up1 up2 fac\n")
disre.write("[ distance_restraints ]\n")
# This part pulls out the two last lines from the clustalw alignment
# and puts each in a list.
# list1 has the Ballesteros-Weinstein positions for the restraints.
# list2 has the final alignment of the sequence, corresponding to the
# pdb that was given.
list1 = []
list2 = []
bwtagged = os.path.join(self.pdb.split(".")[0] + "_bw" + ".aln")
for line in open(bwtagged, "r"):
fields = line.split()
try:
if fields[0] == 'bw':
list1.append(fields[1])
if fields[0] == self.pdb:
list2.append(fields[1])
except IndexError:
continue
list1s = ''.join(list1)
list1l = list(list1s)
list2s = ''.join(list2)
list2l = list(list2s)
# The two lists are combined into a list of tuples using the zip builtin
# and then the dashes (sequence gaps) are parsed out of the
# corresponding well aligned sequence to be able to count without gaps.
# Finally the residue numbers corresponding to the A mask are returned.
ziplist = list(zip(list1l, list2l))
nodash = []
for index, string in enumerate(ziplist):
if ziplist[index][1].isalpha():
nodash.append(ziplist[index])
resid = []
for index, string in enumerate(nodash):
if nodash[index][0] == 'A':
resid.append(index+1)
# To map residue id's to c-alpha atom numbers I open the parsed
# pdb which contains only c-alphas and look for the existence of
# the located residue numbers in the resid list.
caid = []
onlyca = open(os.path.join(self.pdb.split(".")[0] + "_CA.pdb"), "r")
firstline = onlyca.readline()
offset = int(int(firstline[22:26]) - 1)
renumbered = [x + offset for x in resid]
logger_bw4posres.debug('renumbered: {}'.format(renumbered))
# Take care of the possible offset of sequence numbers not starting
# at 1.
for line in onlyca:
if int(''.join(line[22:26])) in renumbered:
caid.append(line[7:11])
# These two lists must have the same dimension:
logger_bw4posres.debug("The following two lists must have the same dimension:")
logger_bw4posres.debug('{} \t {}'.format(len(bwtags), len(caid)))
# The mapping (rosetta-stone) between BW id numbers and c-alpha atom
# numbers is made in this list.
bw2calpha = list(zip(bwtags,caid))
bwatom1 = []
bwatom2 = []
for index, value in enumerate(bwpairs):
bwatom1.append(value[0])
bwatom2.append(value[1])
caatom1 = []
for i in range(0, 24):
for j in range (0,36):
# TODO: make sure that a pdb with gaps will also go through.
# right now a pdb with gaps on the sequential numbering of
# residues will have a wrong mapping and not even an error message
# is issued.
# Issue reported by user Laurens Kooijman
if bwatom1[i] == bw2calpha[j][0]:
caatom1.append(bw2calpha[j][1])
caatom2 = []
for i in range(0, 24):
for j in range (0,36):
if bwatom2[i] == bw2calpha[j][0]:
caatom2.append(bw2calpha[j][1])
for i in range(0,24):
logger_bw4posres.debug('{}, {}, {}'.format(caatom1[i], caatom2[i], bwpairs[i]))
# This part finally writes to the disre.itp file all the information it needs
# for the atom pairs after they have been mapped from Ballesteros-Weinstein
# into c-alpha atom numbers in the original pdb given as input.
for index, string in enumerate(readdisre):
fields = string.split()
fields.insert(0, caatom2[index])
fields.insert(0, caatom1[index])
#It would be better to have first 9 fields be float
#instead of str.
fields.insert(9, '\n')
disre.write('{0}'.format(' '.join(fields)))
disre.close()
return True