-
Notifications
You must be signed in to change notification settings - Fork 22
/
transition.py
357 lines (308 loc) · 13.5 KB
/
transition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
import argparse
import networkx as nx
import matplotlib.pyplot as plt
import random
import numpy as np
import math
from scipy import stats
'''
first version: unweighted, undirected network
use edge random walk to generate edge transction matrix based on EM algorithm
'''
def parse_args():
'''
Parses the node2vec arguments.
'''
parser = argparse.ArgumentParser(description="Run edge transition matrix.")
parser.add_argument('--input', nargs='?', default='data.csv',
help='Input graph path')
parser.add_argument('--output', nargs='?', default='matrix.txt',
help='store transition matrix')
parser.add_argument('--type_size', type=int, default=3,
help='Number of edge types. Default is 3.')
parser.add_argument('--em_iteration', default=5, type=int,
help='EM iterations for transition matrix')
parser.add_argument('--e_step', default=3, type=int,
help='E step in the EM algorithm: there are four expectation metrics')
parser.add_argument('--dimensions', type=int, default=10,
help='Number of dimensions. Default is 10.')
parser.add_argument('--walk-length', type=int, default=3,
help='Length of walk per source. Default is 3.')
parser.add_argument('--num-walks', type=int, default=2,
help='Number of walks per source. Default is 2.')
parser.add_argument('--window-size', type=int, default=5,
help='Context size for optimization. Default is 5.')
parser.add_argument('--iter', default=10, type=int,
help='Number of epochs in SGD')
parser.add_argument('--workers', type=int, default=8,
help='Number of parallel workers. Default is 8.')
parser.add_argument('--p', type=float, default=1,
help='Return hyperparameter. Default is 1.')
parser.add_argument('--q', type=float, default=1,
help='Inout hyperparameter. Default is 1.')
#dest='weighted' means the arg parameter name is weighted.
# There is only one parameter: args.weighted
parser.add_argument('--weighted', dest='weighted', action='store_true',
help='Boolean specifying (un)weighted. Default is unweighted.')
parser.add_argument('--unweighted', dest='weighted', action='store_false')
parser.set_defaults(weighted=False)
parser.add_argument('--directed', dest='directed', action='store_true',
help='Graph is (un)directed. Default is undirected.')
parser.add_argument('--undirected', dest='directed', action='store_false')
parser.set_defaults(directed=False)
return parser.parse_args()
def read_graph(edgeList,weighted=False, directed=False):
'''
Reads the input network in networkx.
'''
if weighted:
G = nx.read_edgelist(edgeList, nodetype=str, data=(('type',int),('weight',float),('id',int)), create_using=nx.DiGraph())
else:
G = nx.read_edgelist(edgeList, nodetype=str,data=(('type',int),('id',int)), create_using=nx.DiGraph())
for edge in G.edges():
G[edge[0]][edge[1]]['weight'] = 1.0
if not directed:
G = G.to_undirected()
# print (G.edges(data = True))
return G
def initialize_edge_type_matrix(type_num):
'''
initialize a transition matrix with equal values
'''
initialized_val = 1.0/(type_num*type_num)
matrix = [ [ initialized_val for i in range(type_num) ] for j in range(type_num) ]
return matrix
def simulate_walks(G, num_walks, walk_length,matrix,is_directed,p,q):
'''
generate random walk paths constrainted by transition matrix
'''
walks = []
links = list(G.edges(data = True))
print 'Walk iteration:'
for walk_iter in range(num_walks):
print str(walk_iter+1), '/', str(num_walks)
random.shuffle(links)
count = 1000
for link in links:
# print "chosen link id: ",link[2]['id']
walks.append(edge2vec_walk(G, walk_length, link,matrix,is_directed,p,q))
count = count - 1
if count == 0 and len(links)>1000:#control the pairwise list length
break
return walks
def edge2vec_walk(G, walk_length, start_link,matrix,is_directed,p,q):
'''
return a random walk path
'''
# print "start link: ", type(start_link), start_link
walk = [start_link]
result = [str(start_link[2]['type'])]
# print "result ",result
while len(walk) < walk_length:# here we may need to consider some dead end issues
cur = walk[-1]
start_node = cur[0]
end_node = cur[1]
cur_edge_type = cur[2]['type']
'''
find the direction of link to go. If a node degree is 1, it means if go that direction, there is no other links to go further
if the link are the only link for both nodes, the link will have no neighbours (need to have teleportation later)
'''
'''
consider the hub nodes and reduce the hub influence
'''
if is_directed: # directed graph has random walk direction already
direction_node = end_node
left_node = start_node
else:# for undirected graph, first consider the random walk direction by choosing the start node
start_direction = 1.0/G.degree(start_node)
end_direction = 1.0/G.degree(end_node)
prob = start_direction/(start_direction+end_direction)
# print "start node: ", start_node, " degree: ", G.degree(start_node)
# print "end node: ", end_node, " degree: ", G.degree(end_node)
# print cur[0], cur[1]
rand = np.random.rand()
# print "random number ",rand
# print "probability for start node: ",prob
if prob >= rand:
# print "yes"
direction_node = start_node
left_node = end_node
else:
direction_node = end_node
left_node = start_node
# print "directed node: ",direction_node
# print "left_node node: ",left_node
'''
here to choose which link goes to. There are three conditions for the link based on node distance. 0,1,2
'''
neighbors = G.neighbors(direction_node)
# print G.has_edge(1,3)
# print G.has_edge(3,1)
'''
calculate sum of distance, with +1 normalization
'''
distance_sum = 0
for neighbor in neighbors:
# print "neighbors:", neighbor
neighbor_link = G[direction_node][neighbor]#get candidate link's type
# print "neighbor_link: ",neighbor_link
neighbor_link_type = neighbor_link['type']
# print "neighbor_link_type: ",neighbor_link_type
neighbor_link_weight = neighbor_link['weight']
trans_weight = matrix[cur_edge_type-1][neighbor_link_type-1]
if G.has_edge(neighbor,left_node) or G.has_edge(left_node,neighbor):
distance_sum += trans_weight*neighbor_link_weight/p
elif neighbor == left_node: #decide whether it can random walk back
distance_sum += trans_weight*neighbor_link_weight
else:
distance_sum += trans_weight*neighbor_link_weight/q
'''
pick up the next step link
'''
# random.shuffle(neighbors)
rand = np.random.rand() * distance_sum
threshold = 0
# next_link_end_node = 0
neighbors2 = G.neighbors(direction_node)
for neighbor in neighbors2:
# print "current threshold: ", threshold
neighbor_link = G[direction_node][neighbor]#get candidate link's type
neighbor_link_type = neighbor_link['type']
neighbor_link_weight = neighbor_link['weight']
trans_weight = matrix[cur_edge_type-1][neighbor_link_type-1]
if G.has_edge(neighbor,left_node) or G.has_edge(left_node,neighbor):
threshold += trans_weight*neighbor_link_weight/p
if threshold >= rand:
next_link_end_node = neighbor
break;
elif neighbor == left_node:
threshold += trans_weight*neighbor_link_weight
if threshold >= rand:
next_link_end_node = neighbor
break;
else:
threshold += trans_weight*neighbor_link_weight/q
if threshold >= rand:
next_link_end_node = neighbor
break;
# print "distance_sum: ",distance_sum
# print "rand: ", rand, " threshold: ", threshold
# print "next_link_end_node: ",next_link_end_node
if distance_sum > 0: # the direction_node has next_link_end_node
next_link = G[direction_node][next_link_end_node]
# next_link = G.get_edge_data(direction_node,next_link_end_node)
next_link_tuple = tuple()
next_link_tuple += (direction_node,)
next_link_tuple += (next_link_end_node,)
next_link_tuple += (next_link,)
# print type(next_link_tuple)
# print next_link_tuple
walk.append(next_link_tuple)
result.append(str(next_link_tuple[2]['type']))
# print "walk length: ",len(walk),walk
else:
break
# print "path: ",result
return result
def update_trans_matrix(walks,type_size,evaluation_metric):
'''
E step, update transition matrix
'''
#here need to use list of list to store all edge type numbers and use KL divergence to update
matrix = [ [ 0 for i in range(type_size) ] for j in range(type_size) ]
repo = dict()
for i in range(type_size):#initialize empty list to hold edge type vectors
repo[i] = []
for walk in walks:
curr_repo = dict()#store each type number in current walk
for edge in walk:
edge_id = int(edge) - 1
if edge_id in curr_repo:
curr_repo[edge_id] = curr_repo[edge_id]+1
else:
curr_repo[edge_id] = 1
for i in range(type_size):
# print "curr_repo[i]: ",curr_repo[i],type(curr_repo[i])
if i in curr_repo:
repo[i].append(curr_repo[i])
else:
repo[i].append(0)
for i in range(type_size):
# print "repo ",i, ": ",repo[i],type(repo[i])
for j in range(type_size):
if evaluation_metric == 1:
sim_score = wilcoxon_test(repo[i],repo[j])
matrix[i][j] = sim_score
# print "each pair of edge type sim_score: ", sim_score
elif evaluation_metric == 2:
sim_score = entroy_test(repo[i],repo[j])
matrix[i][j] = sim_score
elif evaluation_metric == 3:
sim_score = spearmanr_test(repo[i],repo[j])
matrix[i][j] = sim_score
elif evaluation_metric == 4:
sim_score = pearsonr_test(repo[i],repo[j])
matrix[i][j] = sim_score
else:
raise ValueError('not correct evaluation metric! You need to choose from 1-4')
return matrix
'''
different ways to calculate correlation between edge-types
'''
#pairwised judgement
def wilcoxon_test(v1,v2):# original metric: the smaller the more similar
result = stats.wilcoxon(v1, v2).statistic
if result != result:
result = 0
return 1/(math.sqrt(result)+1)
def entroy_test(v1,v2):#original metric: the smaller the more similar
result = stats.entropy(v1,v2)
result = stats.wilcoxon(v1, v2).statistic
if result != result:
result = 0
return result
def spearmanr_test(v1,v2):#original metric: the larger the more similar
result = stats.mstats.spearmanr(v1,v2).correlation
result = stats.wilcoxon(v1, v2).statistic
if result != result:
result = -1
return sigmoid(result)
def pearsonr_test(v1,v2):#original metric: the larger the more similar
result = stats.mstats.pearsonr(v1,v2)[0]
result = stats.wilcoxon(v1, v2).statistic
if result != result:
result = -1
return sigmoid(result)
def cos_test(v1,v2):
return 1 - spatial.distance.cosine(v1, v2)
def sigmoid(x):
return 1 / (1 + math.exp(-x))
def standardization(x):
return (x+1)/2
def relu(x):
return (abs(x) + x) / 2
def main(args):
# print "------begin to write graph---------"
# generate_graph_write_edgelist(args.m1,args.m2,args.input)
print "begin to initialize transition matrix"
trans_matrix = initialize_edge_type_matrix(args.type_size)
print trans_matrix
print "------begin to read graph---------"
G = read_graph(args.input,args.weighted,args.directed)
# print G.edges(data=True)
# nodes = list(G.nodes)
# print G.number_of_edges(),nodes,[n for n in G.neighbors('3')]
# # G=nx.barbell_graph(17,1)
# # draw_graph(G)
print "------begin to simulate walk---------"
for i in range(args.em_iteration):
walks = simulate_walks(G,args.num_walks, args.walk_length,trans_matrix,args.directed,args.p,args.q)#M step
print str(i), "th iteration for Upating transition matrix!"
trans_matrix = update_trans_matrix(walks,args.type_size,args.e_step)#E step
print "trans_matrix: ",trans_matrix
# print walks
print "------finish!---------"
np.savetxt(args.output, trans_matrix)
if __name__ == "__main__":
args = parse_args()
main(args)