diff --git a/parsing/__init__.py b/parsing/__init__.py index 4715eb8..4df8aa4 100644 --- a/parsing/__init__.py +++ b/parsing/__init__.py @@ -123,7 +123,7 @@ """ __all__ = ["SpecError", "UnexpectedToken", "Nonterm", "Precedence", "Spec", "Token", "Lr", "Glr", - "ModuleSpecSource"] + "ModuleSpecSource", "Grammar"] from six import print_ from six.moves import range @@ -137,6 +137,7 @@ from parsing.ast import Symbol, Nonterm, Token # noqa from parsing.automaton import Spec from parsing.module_spec import ModuleSpecSource +from parsing.class_spec import Grammar # Exception aliases for legacy code that needs the old names that # shadow builtin exceptions @@ -203,9 +204,10 @@ def reset(self): self._start = None self._stack = [(Epsilon(self), 0)] - def token(self, token): + def token(self, token, tokenSpec=None): """Feed a token to the parser.""" - tokenSpec = self._spec._sym2spec[type(token)] + if tokenSpec is None: + tokenSpec = self._spec._sym2spec[type(token)] self._act(token, tokenSpec) def eoi(self): @@ -230,6 +232,8 @@ def _act(self, sym, symSpec): while True: top = self._stack[-1] if symSpec not in self._spec._action[top[1]]: + for k in self._spec._action[top[1]]: + print("K:", repr(k), id(k)) raise UnexpectedToken("Unexpected token: %r" % sym) actions = self._spec._action[top[1]][symSpec] @@ -275,7 +279,30 @@ def _reduce(self, production): self._stack.append((r, self._spec._goto[top[1]][production.lhs])) def _production(self, production, rhs): - sym = production.lhs.nontermType(self) + sym = production.lhs.nontermType(self, production.lhs) + sym.type = production.lhs.name + if rhs: + try: + first_idx = 0 + last_idx = len(rhs) - 1 + # skip epsilon productions, look into lists (for x* and x+) + while last_idx >= first_idx and not rhs[last_idx]: + last_idx -= 1 + while last_idx >= first_idx and not rhs[first_idx]: + first_idx += 1 + if last_idx >= first_idx: + last_rhs = rhs[last_idx] + if isinstance(last_rhs, list): + last_rhs = last_rhs[-1] + first_rhs = rhs[first_idx] + if isinstance(first_rhs, list): + first_rhs = first_rhs[0] + if first_rhs.range is not None and last_rhs.range is not None: + sym.range = [first_rhs.range[0], last_rhs.range[1]] + else: + sym.range = None + except AttributeError: + pass nRhs = len(rhs) assert nRhs == len(production.rhs) r = production.method(sym, *rhs) @@ -406,14 +433,15 @@ def reset(self): self._paths = [] - def token(self, token): + def token(self, token, tokenSpec=None): """ Feed a token to the parser. 
""" if self._verbose: print_("%s" % ("-" * 80)) print_("INPUT: %r" % token) - tokenSpec = self._spec._sym2spec[type(token)] + if tokenSpec is None: + tokenSpec = self._spec._sym2spec[type(token)] self._act(token, tokenSpec) if len(self._gss) == 0: raise UnexpectedToken("Unexpected token: %r" % token) diff --git a/parsing/ast.py b/parsing/ast.py index 5b221c3..8f3bd2a 100644 --- a/parsing/ast.py +++ b/parsing/ast.py @@ -6,7 +6,8 @@ """ from parsing.interfaces import is_parser, is_symspec - +from parsing.errors import SpecError +from re import compile as re_compile, escape as re_escape class Symbol(object): def __init__(self, symSpec, parser): @@ -79,9 +80,11 @@ def reduceB(self, id): "%reduce id" """ - def __init__(self, parser): + def __init__(self, parser, symSpec=None): assert is_parser(parser) - Symbol.__init__(self, parser._spec._sym2spec[type(self)], parser) + if symSpec is None: + symSpec = parser._spec._sym2spec[type(self)] + Symbol.__init__(self, symSpec, parser) def merge(self, other): """ @@ -140,7 +143,200 @@ class id(Token): "%token" """ - def __init__(self, parser): - assert is_parser(parser) - Symbol.__init__(self, parser._spec._sym2spec[type(self)], parser) + def __init__(self, parser, spec=None): + assert is_parser(parser), parser + if spec is None: + spec = parser._spec._sym2spec[type(self)] + Symbol.__init__(self, spec, parser) self.__parser = parser + +NOT_SET=object() + + +class ASTToken(Token): + def __init__(self, parser, spec, word, range, val=NOT_SET, **kwargs): + Token.__init__(self, parser, spec) + self.type = spec.name + self.word = word + if val is NOT_SET: + self.val = word + else: + self.val = val + self.range = range + self.__dict__.update(kwargs) + + def __repr__(self): + return '%s[%d-%d,word=%s,val=%r]'%( + self.symSpec.name, self.range[0], self.range[1], + self.word, self.val) + + +class TokenBuilder(object): + """ + carries infos for recognizing and building a token + """ + def __init__(self, token_re, prec='none', convert=None, factory=None, keyword=None, name=None): + self._re = token_re + self._prec = prec + self.convert = convert + if factory is None: + self.factory = ASTToken + else: + self.factory = factory + self.keyword = keyword + self.name = name + + def __hash__(self): + return hash(( + self._re, self.name)) + + def __eq__(self, other): + """ + equality of two TokenBuilder objects, as needed for unpickling + """ + return self.__dict__ == other.__dict__ + + def __ne__(self, other): + return not self == other + + def __call__(self, parser, symSpec, word=None, range=None, **kwargs): + if self.convert is None: + val = word + else: + val = self.convert(word) + return self.factory(parser, symSpec, word, range, val=val, **kwargs) + +def is_token_factory(tokenType): + if isinstance(tokenType, type) and issubclass(tokenType, Token): + return True + if isinstance(tokenType, TokenBuilder): + return True + return False + +def mktoken(name, prec='none', re=None, s=None, tokens=None, keyword=None, + between=None, escape=None, convert=None): + """ + creates a token class (that is then converted into a TokenSpec), i.e. this is + a Token factory factory. 
+ + :param name: the name of the token class + :param prec: the precedence + :param re: a regular expression describing the token + :param s: a fixed string for the token + :param tokens: a string containing a space-separated list of matching tokens + :param keyword: a keyword is a string that is also matched by another RE + :param convert: the function used to construct the semantic function + :return: + """ + token_re = None + if re is not None: + token_re = re + elif s is not None: + token_re = re_escape(s) + elif tokens is not None: + token_re = '(?:%s)'%( + '|'.join([re_escape(tok) for tok in tokens.split()])) + elif between is not None: + if len(between) != 2: + raise SpecError("Need exactly two items for between: %s"%(between,)) + starter, ender = between + not_enders = [] + for i in xrange(len(ender)): + not_enders.append('{}[^{}]'.format( + re_escape(ender[:i]), re_escape(ender[i]))) + token_re = '{}(?:{})*{}'.format( + re_escape(starter), + '|'.join([x for x in not_enders]), + re_escape(ender)) + #print(token_re) + if convert is None: + def my_convert(s): + assert s.startswith(starter) and s.endswith(ender) + return s[len(starter):-len(ender)] + convert = my_convert + else: + token_re = None + + return TokenBuilder(token_re, prec, convert, keyword=keyword, name=name) + +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + +def print_ast(nonterm, indent=0, attribute_order=None): + assert isinstance(nonterm, Symbol), type(nonterm) + s_indent = ' ' * indent + if isinstance(nonterm, Nonterm): + if hasattr(nonterm, 'type'): + attr_type = nonterm.type + else: + attr_type = None + cls_type = type(nonterm).__name__ + if attr_type != cls_type: + type_expr = '%s%s%s[%s]'%( + bcolors.BOLD, attr_type, bcolors.ENDC, cls_type) + else: + type_expr = '%s%s%s'%(bcolors.BOLD, attr_type, bcolors.ENDC) + if hasattr(nonterm, 'range'): + nt_range = nonterm.range + range_expr = '%d-%d'%(nt_range[0], nt_range[1]) + else: + range_expr = '??-??' + print("%s%s %s"%( + s_indent, type_expr, + range_expr)) + else: + if nonterm.word != nonterm.val: + val_expr = "%r '%s'"%(nonterm.val, nonterm.word) + else: + val_expr = "'%s'"%(nonterm.word,) + if hasattr(nonterm, 'range'): + nt_range = nonterm.range + range_expr = '%d-%d'%(nt_range[0], nt_range[1]) + else: + range_expr = '??-??' 
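+        # Token leaf: print the bold token type, [class name], source range
+        # and value.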
+ print("%s%s%s%s[%s] %s %s" % ( + s_indent, + bcolors.BOLD, nonterm.type, bcolors.ENDC, + type(nonterm).__name__, + range_expr, + val_expr)) + d = nonterm.__dict__ + def print_attribute(k): + v = getattr(nonterm, k) + if isinstance(v, Symbol): + print('%s %s:' % (s_indent, k)) + print_ast(v, indent + 4, attribute_order) + elif isinstance(v, list): + print('%s %s:' % (s_indent, k)) + for val in v: + if isinstance(val, Symbol): + print_ast(val, indent + 4, attribute_order) + else: + print('%s - %r' % (s_indent, val)) + elif isinstance(v, dict): + print('%s %s:' % (s_indent, k)) + for key in v: + + val = v[key] + if isinstance(val, Symbol): + print('%s [%s]' % (s_indent, key)) + print_ast(val, indent + 6, attribute_order) + else: + print('%s [%s] %r' % (s_indent, key, val)) + + if attribute_order is not None: + for k in attribute_order: + if k in d: + print_attribute(k) + for k in sorted(d.keys()): + if k[0] != '_' and k not in ['type', 'range'] and ( + attribute_order is None or k not in attribute_order): + print_attribute(k) + diff --git a/parsing/automaton.py b/parsing/automaton.py index 0d03412..2538da7 100644 --- a/parsing/automaton.py +++ b/parsing/automaton.py @@ -11,10 +11,13 @@ from parsing.interfaces import is_spec_source from parsing import introspection from parsing import module_spec +from parsing.ast import Nonterm from parsing.grammar import (Precedence, Production, TokenSpec, NontermSpec, SymbolSpec, EndOfInput, eoi, Epsilon, epsilon, NontermStart, Action, ShiftAction, ReduceAction) +RETURN_NONE = object() + class String(list): def __init__(self, args=[]): @@ -409,6 +412,7 @@ def __init__(self, self._precedences = {self._none.name: self._none, self._split.name: self._split} self._nonterms = {} + self._aux_nonterms = {} self._tokens = {eoi.name: eoi, epsilon.name: epsilon} self._sym2spec = {EndOfInput: eoi, Epsilon: epsilon} self._productions = [] @@ -566,7 +570,7 @@ def _prepare(self, adapter, pickleFile, pickleMode, logFile, graphFile): the Parser class for parsing. """ # Get the grammar specification. 
-        if isinstance(adapter, types.ModuleType) or (
+        if (isinstance(adapter, types.ModuleType) or
                 isinstance(adapter, list) and
                 isinstance(adapter[0], types.ModuleType)):
             adapter = module_spec.ModuleSpecSource(adapter)
@@ -691,10 +695,12 @@ def _introspect(self, adapter):
                     raise SpecError(
                         "Identical precedence/nonterm names: %s" % v.__doc__)
                 if name in self._tokens:
-                    raise SpecError("Identical token/nonterm names: %s" %
-                                    v.__doc__)
+                    raise SpecError("Identical token/nonterm names: %s with %s" %
+                                    (name, v.__doc__))
                 if name in self._nonterms:
-                    raise SpecError("Duplicate nonterm name: %s" % v.__doc__)
+                    raise SpecError("Duplicate nonterm name: [%s] %s" % (
+                        name, v.__doc__))
                 self._nonterms[name] = nonterm
                 self._sym2spec[v] = nonterm
@@ -702,6 +708,48 @@
         if not isinstance(self._userStartSym, NontermSpec):
             raise SpecError("No start symbol specified")
 
+    def aux_nonterm(self, name):
+        """
+        Returns (creating it on first use) the auxiliary nonterminal for a
+        symbol reference suffixed with '?', '*' or '+' in a %reduce
+        docstring, e.g. 'Arg+'.
+        """
+        if name in self._aux_nonterms:
+            return self._aux_nonterms[name]
+        else:
+            def list_add(self, lst, x):
+                lst.append(x)
+                return lst
+
+            original_name = name[:-1]
+            variant = name[-1]
+            prec = self._precedences['none']
+            nt_class = Nonterm
+
+            module_name = nt_class.__module__
+            qualified = '%s.%s' % (module_name, name)
+            nonterm = NontermSpec(Nonterm, name, qualified, prec)
+            try:
+                sym = self._nonterms[original_name]
+            except KeyError:
+                sym = self._tokens[original_name]
+            if variant == '?':
+                # X?: either nothing (RETURN_NONE) or a single X
+                rules_rhs = [[], [sym]]
+                reducers = [lambda self: RETURN_NONE, lambda self, x: x]
+            elif variant == '*':
+                # X*: a possibly empty list of X
+                rules_rhs = [[], [nonterm, sym]]
+                reducers = [lambda self: [], list_add]
+            elif variant == '+':
+                # X+: a non-empty list of X
+                rules_rhs = [[sym], [nonterm, sym]]
+                reducers = [lambda self, x: [x], list_add]
+            else:
+                assert False, variant
+            # register the auxiliary productions with the grammar
+            for i, (rhs, reducer) in enumerate(zip(rules_rhs, reducers)):
+                prod = Production(
+                    reducer, "%s._%d" % (qualified, i),
+                    prec, nonterm, rhs)
+                assert prod not in nonterm.productions
+                nonterm.productions.append(prod)
+                self._productions.append(prod)
+            self._aux_nonterms[name] = nonterm
+            return nonterm
+
     # Resolve all symbolic (named) references.
     def _references(self, logFile, graphFile):
         # Build the graph of Precedence relationships.
@@ -739,6 +787,11 @@ def _references(self, logFile, graphFile):
                         rhs_terms.append(self._tokens[tok])
                     elif tok in self._nonterms:
                         rhs.append(self._nonterms[tok])
+                    elif tok[-1] in '?+*' and (
+                        tok[:-1] in self._nonterms or
+                        tok[:-1] in self._tokens
+                    ):
+                        rhs.append(self.aux_nonterm(tok))
                     else:
                         raise SpecError(
                             "Unknown symbol '%s' in reduction "
@@ -775,6 +828,7 @@
                 assert prod not in nonterm.productions
                 nonterm.productions.append(prod)
                 self._productions.append(prod)
+        self._nonterms.update(self._aux_nonterms)
         if self._verbose:
             ntokens = len(self._tokens) - 1
             nnonterms = len(self._nonterms) - 1
diff --git a/parsing/class_spec.py b/parsing/class_spec.py
new file mode 100644
index 0000000..4250c7c
--- /dev/null
+++ b/parsing/class_spec.py
@@ -0,0 +1,139 @@
+from __future__ import print_function
+import re
+from parsing.ast import Nonterm, TokenBuilder, is_token_factory
+from parsing.grammar import Precedence, TokenSpec, NontermSpec, SpecError
+from parsing.automaton import Spec
+from parsing.scanner import Scanner
+from parsing.ruledsl import interpret_docstring
+from types import MethodType, FunctionType
+
+from six import iteritems
+from future.utils import with_metaclass
+
+
+class GrammarMetaclass(type):
+    def __init__(cls, name, bases, clsdict):
+        for k, v in iteritems(clsdict):
+            if hasattr(v, "name"):
+                if v.name is None:
+                    v.name = k
+                elif v.name != k:
+                    raise SpecError("Names must match: %s / %s" % (v.name, k))
+        type.__init__(cls, name, bases, clsdict)
+        cls._nonterms = {}
+
+
+class NontermMetaclass(type):
+    def __init__(cls, name, bases, clsdict):
+        more_stuff = {}
+        if "__doc__" in clsdict:
+            doc = clsdict["__doc__"]
+            interpret_docstring(doc, more_stuff, name)
+        type.__init__(cls, name, bases, clsdict)
+        gram_cls = cls._grammar_cls
+        if name in gram_cls._nonterms:
+            raise SpecError("duplicate Nonterm class %s" % (name,))
+        for k, v in iteritems(more_stuff):
+            # print(k, type(v), isinstance(v, FunctionType))
+            setattr(cls, k, v)
+        # the Nonterm base class is skipped
+        if not (
+            name == "Nonterm"
+            and len([x for x in list(clsdict.values()) if isinstance(x, MethodType)])
+            == 0
+        ):
+            gram_cls._nonterms[name] = cls
+
+
+keyword_re = re.compile("[a-z]+|[A-Z]+")
+
+
+class Grammar(with_metaclass(GrammarMetaclass, object)):
+    whitespace = r"\s+"
+
+    @classmethod
+    def nonterm_base(cls):
+        result = NontermMetaclass("Nonterm", (Nonterm,), {"_grammar_cls": cls})
+        # register this as part of the grammar's module
+        result.__module__ = cls.__module__
+        return result
+
+    @classmethod
+    def get_precedences(cls):
+        result = []
+        for k, v in iteritems(cls.__dict__):
+            if isinstance(v, Precedence):
+                result.append(v)
+        return result
+
+    @classmethod
+    def get_tokens(cls):
+        if hasattr(cls, "_tokens"):
+            return cls._tokens
+        result = []
+        for k, v in iteritems(cls.__dict__):
+            if is_token_factory(v):
+                if hasattr(v, "_prec"):
+                    prec = v._prec
+                else:
+                    prec = "none"
+                result.append(TokenSpec(v, k, prec))
+        literal_tokens = set()
+        for k, v in iteritems(cls._nonterms):
+            NontermSpec.find_literal_tokens(v, literal_tokens)
+        literal_by_name = {}
+        for token in literal_tokens:
+            if token in literal_by_name:
+                result.append(literal_by_name[token])
+                continue
+            clean_token = token[1:-1]
+            if keyword_re.match(clean_token):
+                keyword = clean_token
+            else:
+                keyword = None
+            builder = TokenBuilder(re.escape(clean_token), keyword=keyword, name=token)
+            spec = TokenSpec(builder, token, "none")
+            literal_by_name[token] = spec
+            result.append(spec)
+        cls._tokens = result
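+        # Cache the computed specs so repeated calls (e.g. from get_scanner)
+        # reuse the same TokenSpec objects.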
return result + + @classmethod + def get_nonterminals(cls): + result = [] + startSym = None + print(cls._nonterms) + for k, v in iteritems(cls._nonterms): + nonterm, is_start = NontermSpec.from_class(v, k) + result.append(nonterm) + if is_start: + if startSym is not None: + raise SpecError( + "Only one start non-terminal allowed: %s / %s" + % (v.__doc__, startSym) + ) + else: + startSym = nonterm + return result, startSym + + @classmethod + def get_scanner(cls): + if hasattr(cls, "_scanner"): + scanner = cls._scanner + else: + scanner = Scanner(cls.get_tokens(), cls.whitespace) + cls._scanner = scanner + return scanner + + @classmethod + def feed(cls, string, parser): + cls.get_scanner().scan(string, parser) + + @classmethod + def spec(cls, *args, **kwargs): + if hasattr(cls, "_spec"): + return cls._spec + else: + spec = Spec(cls, *args, **kwargs) + cls._spec = spec + return spec diff --git a/parsing/grammar.py b/parsing/grammar.py index 9c93eca..dbf0f64 100644 --- a/parsing/grammar.py +++ b/parsing/grammar.py @@ -25,7 +25,8 @@ import re import sys -from parsing.ast import Token, Nonterm +import types +from parsing.ast import Token, Nonterm, is_token_factory from parsing.errors import SpecError from parsing import introspection @@ -102,6 +103,28 @@ class P4(Parsing.Precedence): """ assoc_tok_re = re.compile(r'([<>=])([A-Za-z]\w*)') + @classmethod + def create(cls, name=None, precedence='fail', before=None, after=None): + relationships = {} + prec = cls(name, precedence, relationships) + if before is not None: + before.dominators.add(prec) + if after is not None: + prec.dominators.add(after) + return prec + + @classmethod + def left(cls, **kwargs): + return cls.create(precedence='left', **kwargs) + + @classmethod + def right(cls, **kwargs): + return cls.create(precedence='right', **kwargs) + + @classmethod + def nonassoc(cls, **kwargs): + return cls.create(precedence='nonassoc', **kwargs) + def __init__(self, name, assoc, relationships): assert assoc in ["fail", "nonassoc", "left", "right", "split"] assert type(relationships) == dict @@ -163,11 +186,12 @@ def followSetMerge(self, set): class NontermSpec(SymbolSpec): - token_re = re.compile(r'([A-Za-z]\w*)') + token_re = re.compile(r"([A-Za-z]\w*[?+*]?|'[^']+')") precedence_tok_re = re.compile(r'\[([A-Za-z]\w*)\]') def __init__(self, nontermType, name, qualified, prec): - assert issubclass(nontermType, Nonterm) # Add forward decl for Lyken. + # Add forward decl for Lyken. 
+ assert isinstance(nontermType, type) and issubclass(nontermType, Nonterm), nontermType SymbolSpec.__init__(self, name, prec) @@ -188,16 +212,19 @@ def from_class(cls, nt_subclass, name=None, module=None): else: dirtoks = introspection.parse_docstring(nt_subclass.__doc__) is_start = (dirtoks[0] == '%start') - # if dirtoks[0] in SHORTHAND: - # dirtoks = ['%nonterm', name] + if dirtoks[0][0] == '%' and dirtoks[0] not in ['%nonterm', '%start']: + dirtoks = ['%nonterm', name] symbol_name = None prec = None i = 1 while i < len(dirtoks): tok = dirtoks[i] + if tok[0] == '%': + if tok not in ['%start', '%nonterm']: + break m = NontermSpec.precedence_tok_re.match(tok) if m: - if i < len(dirtoks) - 1: + if i < len(dirtoks) - 1 and dirtoks[i+1][0] != '%': raise SpecError("Precedence must come last in " "non-terminal specification: %s" % nt_subclass.__doc__) @@ -221,11 +248,29 @@ def from_class(cls, nt_subclass, name=None, module=None): "%s.%s" % (module_name, name), prec) return nonterm, is_start + @classmethod + def find_literal_tokens(cls, nt_subclass, literal_tokens): + d = nt_subclass.__dict__ + for k in d: + v = d[k] + if (isinstance(v, types.FunctionType) and + isinstance(v.__doc__, str)): + dirtoks = v.__doc__.split(" ") + if dirtoks[0] == "%reduce": + for i in range(1, len(dirtoks)): + tok = dirtoks[i] + m = NontermSpec.token_re.match(tok) + if m and tok[0] == "'": + while tok[-1] in '?*+': + tok = tok[:-1] + #print("find_literal_tokens:", tok, " in ", k) + literal_tokens.add(tok) + # AKA terminal symbol. class TokenSpec(SymbolSpec): def __init__(self, tokenType, name, prec): - assert issubclass(tokenType, Token) + assert is_token_factory(tokenType), repr(tokenType) assert type(name) == str assert isinstance(prec, Precedence) or type(prec) == str diff --git a/parsing/interfaces.py b/parsing/interfaces.py index 3244188..18cfa00 100644 --- a/parsing/interfaces.py +++ b/parsing/interfaces.py @@ -45,3 +45,6 @@ def is_spec_source(source): 'get_precedences', 'get_tokens', 'get_nonterminals' ]): return False + else: + return True + diff --git a/parsing/ruledsl.py b/parsing/ruledsl.py new file mode 100644 index 0000000..9ca6663 --- /dev/null +++ b/parsing/ruledsl.py @@ -0,0 +1,303 @@ +""" +The rule DSL provides a number of shortcuts that +allow the user to define grammar translation rules +more succinctly. 
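+
+For example, Nonterm docstrings like the following (the symbols are
+illustrative, not taken from the shipped grammars) are expanded by
+interpret_docstring() into ordinary %reduce methods:
+
+    "%choice Number Name"
+    "%enum:Color 'red' 'green'"
+    "%list Item ','"
+    "%reduce:Call Name '(' arguments=Argument* ')'"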
+""" + +from __future__ import print_function +import re +from parsing.grammar import NontermSpec, SpecError + +special_re = re.compile('%([a-z]+)($|:)') +first_cap_re = re.compile('(.)([A-Z][a-z]+)') +all_cap_re = re.compile('([a-z0-9])([A-Z])') + +snaked_re = re.compile('_[a-z]') + +rhs_assign_re = re.compile("([A-Za-z]\\w*)([+?]?=)([A-Za-z]\\w*[?+*]?|'[^']+')") +identifier_re = re.compile('[a-zA-Z][0-9a-zA-Z]*$') +symbol_names = { + ';': 'semicolon', + ':': 'colon', + ',': 'comma', + '@': 'atsign' +} + + +def snake_case(name): + """ + converts a name to snake_case + """ + s1 = first_cap_re.sub(r'\1_\2', name) + return all_cap_re.sub(r'\1_\2', s1).lower() + +def camel_case(name): + """ + converts a name to CamelCase + """ + s1 = snaked_re.sub(lambda x: x[1].upper(), name) + return s1[0].upper() + s1[1:] + +postfix = {'?': '_opt', '*': '_opt_list', '+': '_list'} + + +def is_reduce_instr(s): + if s == '%reduce': + return True + elif s.startswith('%reduce:'): + return True + + +def reduce_instr_type(s): + if s == '%reduce': + return None + elif s.startswith('%reduce:'): + return s[8:] + + +class Generator(object): + + def __init__(self, clsdict, name): + self.clsdict = clsdict + self.name = name + self.ruleno = 1 + + def numbered_method(self, prefix='reduce'): + """ + generates a (hopefully) fresh symbol with + the given prefix. + """ + val = self.ruleno + self.ruleno = val + 1 + return '%s_%d' % (prefix, val) + + def add_method(self, lines): + """ + compiles the method given in `lines` to a method + for the class dictionary. + """ + text = '\n'.join(lines) + #print(self.name, "COMPILED") + #print(text) + exec(text, globals(), self.clsdict) + + def compile_reduce(self, lst): + """ + A reduce rule is the equivalent of a normal + reduce rule, but with the added functionality + that AST attributes are derived semi-intelligently + from the RHS + """ + assert is_reduce_instr(lst[0]) + fn_type = reduce_instr_type(lst[0]) + fn_name = self.numbered_method() + rhs_parts = [] + arg_names = [] + for rhs in lst[1:]: + m = rhs_assign_re.match(rhs) + if m: + assigned_name = m.group(1) + assign_op = m.group(2) + rhs = m.group(3) + else: + assigned_name = None + suffix = '' + rhs_parts.append(rhs) + last_char = rhs[-1] + while last_char in '+*?': + suffix += 's' + rhs = rhs[:-1] + last_char = rhs[-1] + if last_char == "'": + arg_name = '_' + else: + # CheeseDeclaration+ => cheese_declarations + arg_name = snake_case(rhs) + suffix + if assigned_name is not None: + arg_names.append(assigned_name) + else: + arg_suffix = 1 + orig_arg_name = arg_name + if arg_name in ['type', 'range']: + arg_name += '_' + while arg_name in arg_names: + arg_suffix += 1 + arg_name = '%s%d' % (orig_arg_name, arg_suffix) + arg_names.append(arg_name) + lst_rhs = ' '.join(['%reduce'] + rhs_parts) + argspec = ', '.join(arg_names) + fillers = { + 'fn_name': fn_name, 'argspec': argspec, + 'lst_rhs': lst_rhs + } + fn_src = ['def %(fn_name)s(self, %(argspec)s):' % fillers, + ' "%(lst_rhs)s"' % fillers] + if fn_type is not None: + fn_src.append(' self.type = "%s"' % (fn_type,)) + for arg_name in arg_names: + if arg_name[0] != '_': + fn_src.append(' self.%(arg_name)s = %(arg_name)s' % { + 'arg_name': arg_name}) + self.add_method(fn_src) + + def compile_choice(self, lst): + """ + The %choice shorthand creates rules with a single symbol + on the right hand side, which gets re-used as the AST for + this node. 
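+
+        For example, '%choice Expr IdentifierList+' (illustrative symbols)
+        generates r_expr and r_identifier_list_list methods whose docstrings
+        are '%reduce Expr' and '%reduce IdentifierList+'.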
+ """ + for name in lst[1:]: + last_char = name[-1] + if last_char not in "?*+'": + arg_name = snake_case(name) + fn_name = 'r_' + arg_name + elif last_char in '?*+': + arg_name = snake_case(name[:-1]) + fn_name = 'r_' + arg_name + postfix[name[-1]] + elif last_char == "'": + arg_name = '_' + fn_name = 'r_const_' + hex(hash(name)) + if not NontermSpec.token_re.match(name): + raise SpecError("%s is not a valid RHS symbol" % (name,)) + fillers = { + 'fn_name': fn_name, 'arg_name': arg_name, + 'name': name + } + fn_src = [ + x % fillers for x in [ + 'def %(fn_name)s(self, %(arg_name)s):', + ' "%%reduce %(name)s"', + ' return %(arg_name)s']] + self.add_method(fn_src) + + def compile_enum(self, lst): + """ + compiles a %enum directive, which simply sets + self.type to the name of the matched keyword (optionally with a suffix). + As an example, `%enum:Foo 'bar' 'baz'` will recognize `bar` and `baz` + keywords and set the `type` attribute to `BarFoo` and `BazFoo`, + respectively. + """ + if ':' in lst[0]: + suffix = lst[0].split(':')[1] + else: + suffix = camel_case(self.name) + for name in lst[1:]: + if name[0] != "'" or name[-1] != "'": + raise SpecError("%s must be a literal (enclosed in '')" % + (name,)) + kwd_name = name[1:-1] + if "'" in kwd_name or "\\" in kwd_name: + raise SpecError("%s has disallowed characters"%(name,)) + if identifier_re.match(kwd_name): + val_name = kwd_name + elif kwd_name in symbol_names: + val_name = symbol_names[kwd_name] + kwd_name = val_name + else: + val_name = hex(hash(kwd_name)) + fillers = { + 'val_name': val_name, + 'suffix': suffix, + 'kwd_name': camel_case(kwd_name), + 'escaped_name': name + } + fn_src = [ + x % fillers for x in [ + 'def reduce_%(val_name)s(self, _x):', + ' "%%reduce %(escaped_name)s"', + " self.type = '%(kwd_name)s%(suffix)s'" + ] + ] + self.add_method(fn_src) + + def compile_list(self, lst): + """ + The `%list item sep` instruction creates rules for + a list of `item`s separated by `sep`s. If `sep` + is not a keyword or literal symbol, the separators + get included in the resulting list. + """ + if len(lst) != 3: + raise SpecError( + "%list needs item and sep arguments, got {}".format(lst[1:])) + if lst[2].startswith("'"): + # simple list with ignorable separator + fn_src = [ + 'def reduce_single(self, item):', + ' "%%reduce %s"' % (lst[1],), + ' return [item]', + 'def reduce_multiple(self, lst, sep, item):', + ' "%%reduce %s %s %s"' % ( + self.name, + lst[2], lst[1]), + ' return lst + [item]'] + else: + # list with non-ignorable separator + fn_src = [ + 'def reduce_single(self, item):', + ' "%%reduce %s"' % (lst[1],), + ' return [item]', + 'def reduce_multiple(self, lst, sep, item):', + ' "%%reduce %s %s %s"' % ( + self.name, + lst[2], lst[1]), + ' return lst + [sep, item]'] + self.add_method(fn_src) + + def compile_start(self, lst): + """ + %start is used for the start symbol (mandatory). + """ + if len(lst) > 2: + raise SpecError( + "%start directive with extra stuff: {}".format(lst)) + if len(lst) == 2 and lst[1] != self.name: + raise SpecError( + "%start directive uses symbol {}, should be {}".format( + lst[1], self.name)) + + def compile_nonterm(self, lst): + """ + %nonterm is used for normal nonterminals (leftover from + module-based declarations) + """ + if len(lst) >= 2 and lst[1] != self.name: + raise SpecError( + "%nonterm directive uses symbol {}, should be {}".format( + lst[1], self.name)) + + def compile(self, lst): + """ + compiles a sequence of symbols (instruction plus parameters) + into one or multiple methods. 
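+
+        For example, the group ['%enum:Color', "'red'", "'green'"] is
+        dispatched to compile_enum based on its instruction name.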
+ """ + m = special_re.match(lst[0]) + instr = m.group(1) + method = getattr(self, 'compile_' + instr) + method(lst) + + +def interpret_docstring(docstring, clsdict, name): + """ + This function is called with the docstrings of Nonterm classes in order + to allow shorthand notations for a few common patterns in AST construction. + + :param docstring: the docstring to be interpreted + :param clsdict: the class namespace + :param name: the name of the class (for recursive rules such as list + """ + if '%' not in docstring: + return + generator = Generator(clsdict, name) + tokens = docstring.split() + cur_group = [] + for token in tokens: + if token[0] == '%': + if cur_group: + generator.compile(cur_group) + cur_group = [token] + else: + cur_group.append(token) + if cur_group: + generator.compile(cur_group) diff --git a/parsing/scanner.py b/parsing/scanner.py new file mode 100644 index 0000000..60bab5e --- /dev/null +++ b/parsing/scanner.py @@ -0,0 +1,54 @@ +from re import compile as re_compile + +class ScannerError(SyntaxError): + def __init__(self, message, position): + SyntaxError.__init__(self, message) + self.offset = position + +class Scanner(object): + def __init__(self, token_classes, re_whitespace): + self.regexes = [ + (re_compile(tok.tokenType._re), tok.tokenType, tok) + for tok in token_classes + if tok.tokenType._re is not None] + self.keywords = dict([ + (tok.tokenType.keyword, (tok.tokenType, tok)) + for tok in token_classes + if tok.tokenType.keyword is not None + ]) + self.whitespace = re_compile(re_whitespace) + self.classes = dict([(tok.name, tok.tokenType) for tok in token_classes]) + + def scan(self, string, parser, callback=None): + whitespace = self.whitespace + regexes = self.regexes + + idx = 0 + while idx < len(string): + m = whitespace.match(string, idx) + if m: + idx = m.end() + continue + max_idx = 0 + max_cls = None + max_spec = None + max_str = None + for (rgx, cls, tspec) in regexes: + m = rgx.match(string, idx) + if m and m.end() > max_idx: + # print "match %s [%s]"%(m.group(), tspec.name) + max_cls = cls + max_str = m.group() + max_idx = m.end() + max_spec = tspec + if max_str in self.keywords: + max_cls, max_spec = self.keywords[max_str] + if max_idx == 0: + raise ScannerError( + 'Scanning failed at position %s "%s"' % (idx,string[idx]), idx) + token = max_cls(parser, max_spec, max_str, range=(idx, max_idx)) + if callback is None: + parser.token(token, max_spec) + else: + callback(token, max_spec) + idx = max_idx
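+
+# Rough usage sketch (names are illustrative; this mirrors what
+# Grammar.get_scanner()/Grammar.feed() in parsing/class_spec.py do):
+#
+#     scanner = Scanner(MyGrammar.get_tokens(), MyGrammar.whitespace)
+#     scanner.scan("1 + 2", parser)   # calls parser.token(token, tokenSpec)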