-
Notifications
You must be signed in to change notification settings - Fork 2
/
parser.py
375 lines (348 loc) · 14.2 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# coding: utf-8
from __future__ import print_function
from collections import defaultdict
from Queue import Queue
from fa import DFA, DFANode
from util import bnf_reader, frozen_items
from lex import Lex
from util import colorful
class LRConflict(Exception):
def __init__(self):
pass
class LRParser(object):
"""
LR语法分析器, 接受token流, 分析出语法树 (所以tokens应该不是parser的一部分)
grammar: defaultdict(list), 解析器语法
analysis_table: LR1 分析表
"""
def __init__(self):
self.grammar = defaultdict(list)
self.terminators = []
self.non_terminators = []
self._first = defaultdict(set)
self.lr_dfa = None
self.lr_table = None
self.idx_items = [] # DFA节点项目集索引
@property
def alphabet(self):
return self.terminators + self.non_terminators
def read_grammar(self, filename):
"""
语法读入
"""
grammar = self.grammar
grammar.clear()
self.terminators, self.non_terminators = [], []
g_in, g_out = defaultdict(int), defaultdict(int)
for l_hand, r_hand in bnf_reader(filename):
symbol = l_hand[1:-1]
__exp__ = [r[1:-1] for r in r_hand.split()]
for exp in __exp__:
g_in[exp] += 1
g_out[symbol] += 1
grammar[symbol].append(__exp__)
for k, v in g_in.iteritems():
if not g_out[k]:
self.terminators.append(k)
self.non_terminators.extend([k for k, v in g_out.iteritems() if v])
# print g_in
# print g_out
def get_eps(self):
"""
返回能推导出 ep 的非终结符
"""
grammar = defaultdict(list)
for symbol, exps in self.grammar.iteritems():
grammar[symbol] = [[e for e in exp] for exp in exps]
eps = dict()
vt = {key: 1 for key in self.terminators}
vt.update({key: 0 for key in self.non_terminators})
for symbol, exps in grammar.iteritems():
if filter(lambda exp: len(exp) == 1 and exp[0] == "ep", exps):
# 存在 symbol ::= ε , 则标记 True 并删除所有产生式
eps[symbol] = True
grammar[symbol] = []
continue
grammar[symbol] = filter(
lambda exp: not sum(vt[e] for e in exp), exps) # 删除所有含有终结符的产生式
if not len(grammar[symbol]):
eps[symbol] = False
while sum(len(exps) for exps in grammar.itervalues()):
# 只要产生式没有被删光就继续
for symbol, exps in grammar.iteritems():
if not exps:
continue
for i in range(len(exps)):
exps[i] = filter(
lambda e: not(not vt[e] and eps.get(e, False)), exps[i])
if filter(lambda exp: not exp, exps):
grammar[symbol] = []
eps[symbol] = True
continue
grammar[symbol] = filter(
lambda exp: sum(eps.get(e, True) for e in exp) == len(exp), exps)
# 如果右部存在为非空的非终结符, 则删除该产生式
if not len(grammar[symbol]):
eps[symbol] = False
eps.update(ep=True)
eps.update({key: False for key in self.terminators if key != "ep"})
return eps
def calc_first(self):
"""
计算出字母表里所有元素的first集合
"""
first = self._first
first.clear()
eps = self.get_eps()
g_in, g_out = defaultdict(int), defaultdict(int)
grap = defaultdict(list) # First集合的邻接表
terminators, non_terminators = self.terminators, self.non_terminators
grammar = self.grammar
for symbol, exps in grammar.iteritems():
for exp in exps:
for i in range(len(exp)):
if i == sum(eps.get(e, False) for e in exp[:i]):
if exp[i] == "ep":
continue
grap[exp[i]].append(symbol)
g_in[symbol] += 1
g_out[exp[i]] += 1
for ter in self.terminators:
first[ter].add(ter)
stack = [ter for ter in terminators]
while len(stack):
# 拓扑排序构造first集合, 暂时不管空弧
top = stack.pop()
for symbol in grap[top]:
first[symbol].update(first[top])
g_in[symbol] -= 1
if g_in[symbol] == 0:
stack.append(symbol)
for symbol in first.keys():
# 处理空弧
if eps.get(symbol, False):
first[symbol].add("ep")
first["ep"] = {"ep"}
def first(self, symbols):
"""
:return: 返回first集.
"""
if isinstance(symbols, str):
return self._first[symbols]
if isinstance(symbols, set):
symbols = list(symbols)
if isinstance(symbols, list):
res = set()
for i in range(len(symbols)):
res.update(self.first(symbols[i]))
if symbols[i] in self.terminators and symbols[i] != "ep":
break
if sum(1 for s in symbols[:i+1]
if "ep" in self._first[s]) != i+1:
break
if "ep" in res \
and sum(1 for s in symbols
if "ep" in self._first[s]) != len(symbols):
res.remove("ep")
return res
def closure(self, item):
"""
item: 四元组, 表示一个项目("start", (), ("A", ), {'$'}), 即 start ::= 丶 'A', $
[A -> α•Bγ, a] -> B in VN , b in First(γa), 都有 ε -> [B -> ▪β, b]
:return: 这个item所在的项目集
"""
que = Queue()
que.put(item)
self._first['$'] = '$'
grammar = self.grammar
res = []
vis = defaultdict(set)
while not que.empty():
top = que.get()
vis[(top[0], top[1], top[2])].update(top[3])
# res.append(top)
if not top[2]:
# 这是一个规约项目, 例如 start -> A•
continue
B, gama = top[2][0], top[2][1:] or ("ep", )
if B not in self.non_terminators:
# B 是非终结符才产生转换
continue
for exp in grammar[B]:
_exp = tuple(exp)
if _exp == ("ep", ):
_exp = tuple()
for a in top[3]:
fst = self.first(list(gama) + [a])
if not fst.issubset(vis[(B, tuple(), _exp)]):
# 如果曾经出现过, 不再入队
vis[(B, tuple(), _exp)].update(fst)
que.put((B, tuple(), _exp, fst))
return [core + (head, ) for core, head in vis.iteritems()]
def compile(self):
"""
利用 self.grammar 编译出dfa, 并构造分析表
:return:
"""
self.calc_first()
alloc = 0
grammar = self.grammar
dfa = DFA()
que = Queue()
dfa.start = DFANode(id=alloc,
lr_items=self.closure(("start", tuple(),
tuple(grammar["start"][0]),
{'$'})))
self.idx_items = [dfa.start]
que.put(dfa.start.lr_items)
vis = dict()
vis[frozen_items(dfa.start.lr_items)] = dfa.start
while not que.empty():
lr_items = que.get()
# if frozen_items(lr_items) in vis:
# continue
dfa_node = vis[frozen_items(lr_items)]
# print 'u_items:'
# print lr_items
tmp = defaultdict(list)
for item in lr_items:
if item[2]:
u_item = (item[0], item[1] + item[2][:1],
item[2][1:], item[3])
tmp[item[2][0]].append(u_item)
# 可能该状态有两个以上项目可以通过 item[2][0] 转换到新项目, 而新的项目集应该是他们的合集
for l_hand, items in tmp.iteritems():
vitem = defaultdict(set)
for item in items:
u_items = self.closure(item)
for u_item in u_items:
vitem[u_item[:-1]].update(u_item[3])
next_items = [core + (head, )
for core, head in vitem.iteritems()]
if frozen_items(next_items) not in vis:
que.put(next_items)
alloc += 1
dfa_node.next[l_hand] = DFANode(id=alloc,
lr_items=next_items)
self.idx_items.append(dfa_node.next[l_hand]) # 插入新节点
vis[frozen_items(next_items)] = dfa_node.next[l_hand]
else:
dfa_node.next[l_hand] = vis[frozen_items(next_items)]
# dfa.draw("LR", show_meta=["lr_items"], generate_id=False)
self.lr_dfa = dfa
# DFA 构造完成
# 构造分析表
lr_table = defaultdict(dict)
que = Queue()
que.put(dfa.start)
vis = dict()
while not que.empty():
tmp = que.get()
if tmp in vis:
continue
vis[tmp] = 1
for item in tmp.lr_items:
if item[2]:
# 移进状态
if item[2][0] in lr_table[tmp.id]:
if lr_table[tmp.id][item[2][0]]['action'] != 'shift':
print(colorful('移近规约冲突', 'Red'))
raise LRConflict()
elif lr_table[tmp.id][item[2][0]]['next'] != \
tmp.next[item[2][0]].id:
print(colorful('移近移近冲突', 'Red'))
raise LRConflict()
lr_table[tmp.id][item[2][0]] = \
dict(action="shift", next=tmp.next[item[2][0]].id)
else:
# 规约状态
for a in item[3]:
if a in lr_table[tmp.id]:
if lr_table[tmp.id][a]['action'] != 'reduce':
print(colorful('移近规约冲突', 'Red'))
raise LRConflict()
elif lr_table[tmp.id][a]['grammar'] != item:
print(colorful('规约规约冲突', 'Red'))
raise LRConflict()
lr_table[tmp.id][a] = dict(action="reduce",
grammar=item)
for next_node in tmp.next.values():
que.put(next_node)
self.lr_table = lr_table
return dfa
def parse(self, tokens, handler):
"""
接受token流, 并进行规约
"""
# print '分析 tokens 流: ', tokens
lr_table = self.lr_table
stat_stack = [0, ]
symbol_stack = ['$', ]
input_stack = [('$', '$')] + tokens[::-1]
# print input_stack
while not (stat_stack == [0] and [x[0] for x in input_stack] == ['$', 'start'] and symbol_stack == ['$']):
top_stat = stat_stack[-1]
top_token_type = input_stack[-1][0] # 取 token_type
# top_token_type = input_stack.pop()[0] # 取token
action = lr_table[top_stat][top_token_type]
if action["action"] == "shift":
stat_stack.append(action["next"])
if input_stack[-1][1] != 'no-sense':
handler.shift(input_stack[-1])
symbol_stack.append(input_stack.pop()[0])
elif action["action"] == "reduce":
grammar = action["grammar"]
handler.reduce(action["grammar"])
# print 'reduce, 利用规则 %s -> %s' % (grammar[0], ' '.join(grammar[1]))
for _ in range(len(grammar[1])):
stat_stack.pop()
symbol_stack.pop()
input_stack.append((grammar[0], 'no-sense'))
print('规约成功, 符合语法规则!')
def show_dfa(self):
"""
输出编译好的dfa, dot格式
"""
que = Queue()
vis = dict()
que.put(self.lr_dfa.start)
while not que.empty():
tmp = que.get()
if tmp in vis:
continue
vis[tmp] = 1
lr_items = tmp.lr_items
tmp.meta["items"] = '\n'.join(['%s -> %s ` %s, %s' % (item[0], ''.join(item[1]), ''.join(item[2]), '/'.join(item[3])) for item in lr_items])
for a in tmp.next.keys():
que.put(tmp.next[a])
# self.lr_dfa.draw("LR", show_meta=["items"])
if __name__ == "__main__":
import readline
if 'libedit' in readline.__doc__:
readline.parse_and_bind("bind ^I rl_complete")
else:
readline.parse_and_bind("tab: complete")
class ParseHandler(object):
def __init__(self):
self.ast = None
self.exps = []
def shift(self, token):
print(colorful('移近 {0} {1}'.format(token[0], token[1]), "Cyan"))
def reduce(self, grammar):
"""
规约动作, grammar为规约用的产身世
"""
print('利用规则' + colorful('%s -> %s' % (grammar[0], ' '.join(grammar[1])), "Green"))
# from util import colorful
lex = Lex()
lex.keyword = ['lambda', '[', ']', 'let', 'define', 'if', 'cond', 'or', 'and', '(', ')']
lex.read_lex('regular_lex.txt')
lex.compile(grammar_type="regular")
print(colorful('词法编译完成...', 'Yellow'))
parser = LRParser()
parser.read_grammar('schepy_grammar.txt')
parser.compile()
print(colorful('语法编译完成...', 'Yellow'))
parser.show_dfa()
while True:
parser.parse(lex.lex(raw_input(), ignore=["limit"]), ParseHandler())