From 8b5a41ad31ecb4310eca4b0831ab9bc3916a897f Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Thu, 14 Dec 2023 14:37:20 +0800 Subject: [PATCH 1/2] Separate lexer and parser from cfront --- src/lexer.c | 743 ++++++++++++++++++++++++++++++++++++ src/main.c | 7 +- src/{cfront.c => parser.c} | 747 +------------------------------------ 3 files changed, 749 insertions(+), 748 deletions(-) create mode 100644 src/lexer.c rename src/{cfront.c => parser.c} (81%) diff --git a/src/lexer.c b/src/lexer.c new file mode 100644 index 00000000..9182045a --- /dev/null +++ b/src/lexer.c @@ -0,0 +1,743 @@ +/* lexer tokens */ +typedef enum { + T_start, /* FIXME: it was intended to start the state machine. */ + T_numeric, + T_identifier, + T_comma, /* , */ + T_string, /* null-terminated string */ + T_char, + T_open_bracket, /* ( */ + T_close_bracket, /* ) */ + T_open_curly, /* { */ + T_close_curly, /* } */ + T_open_square, /* [ */ + T_close_square, /* ] */ + T_asterisk, /* '*' */ + T_divide, /* / */ + T_mod, /* % */ + T_bit_or, /* | */ + T_bit_xor, /* ^ */ + T_bit_not, /* ~ */ + T_log_and, /* && */ + T_log_or, /* || */ + T_log_not, /* ! */ + T_lt, /* < */ + T_gt, /* > */ + T_le, /* <= */ + T_ge, /* >= */ + T_lshift, /* << */ + T_rshift, /* >> */ + T_dot, /* . */ + T_arrow, /* -> */ + T_plus, /* + */ + T_minus, /* - */ + T_minuseq, /* -= */ + T_pluseq, /* += */ + T_oreq, /* |= */ + T_andeq, /* &= */ + T_eq, /* == */ + T_noteq, /* != */ + T_assign, /* = */ + T_increment, /* ++ */ + T_decrement, /* -- */ + T_question, /* ? */ + T_colon, /* : */ + T_semicolon, /* ; */ + T_eof, /* end-of-file (EOF) */ + T_ampersand, /* & */ + T_return, + T_if, + T_else, + T_while, + T_for, + T_do, + T_define, + T_undef, + T_error, + T_include, + T_typedef, + T_enum, + T_struct, + T_sizeof, + T_elipsis, /* ... */ + T_switch, + T_case, + T_break, + T_default, + T_continue +} token_t; + +char token_str[MAX_TOKEN_LEN]; +token_t next_token; +char next_char; +int skip_newline = 1; + +int preproc_match; + +/* Allow replacing identifiers with alias value if alias exists. This is + * disabled in certain cases, e.g. #undef. + */ +int preproc_aliasing = 1; + +/* Point to the first character after where the macro has been called. It is + * needed when returning from the macro body. + */ +int macro_return_idx; + +int is_whitespace(char c) +{ + return (c == ' ' || c == '\t'); +} + +char peek_char(int offset); + +/* is it backslash-newline? */ +int is_linebreak(char c) +{ + return c == '\\' && peek_char(1) == '\n'; +} + +int is_newline(char c) +{ + return (c == '\r' || c == '\n'); +} + +/* is it alphabet, number or '_'? */ +int is_alnum(char c) +{ + return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || (c == '_')); +} + +int is_digit(char c) +{ + return (c >= '0' && c <= '9') ? 1 : 0; +} + +int is_hex(char c) +{ + return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || c == 'x' || + (c >= 'A' && c <= 'F')); +} + +int is_numeric(char buffer[]) +{ + int i, hex, size = strlen(buffer); + + if (size > 2) + hex = (buffer[0] == '0' && buffer[1] == 'x') ? 1 : 0; + else + hex = 0; + + for (i = 0; i < size; i++) { + if (hex && (is_hex(buffer[i]) == 0)) + return 0; + if (!hex && (is_digit(buffer[i]) == 0)) + return 0; + } + return 1; +} + +void skip_whitespace() +{ + while (1) { + if (is_linebreak(next_char)) { + source_idx += 2; + next_char = SOURCE[source_idx]; + continue; + } + if (is_whitespace(next_char) || + (skip_newline && is_newline(next_char))) { + next_char = SOURCE[++source_idx]; + continue; + } + break; + } +} + +char read_char(int is_skip_space) +{ + next_char = SOURCE[++source_idx]; + if (is_skip_space == 1) + skip_whitespace(); + return next_char; +} + +/* get alias name from defined() directive + * i.e., get __arm__ from defined(__arm__) + */ +void read_alias_name_from_defined(char *alias_name, char *src) +{ + int i; + + src = src + 8; /* skip defined( */ + i = 0; + while (src[i] != ')') { + alias_name[i] = src[i]; + i++; + } + alias_name[i] = 0; +} + +char peek_char(int offset) +{ + return SOURCE[source_idx + offset]; +} + +void if_elif_skip_lines() +{ + char peek_c; + int i; + + do { + skip_whitespace(); + i = 0; + do { + token_str[i++] = next_char; + } while (read_char(0) != '\n'); + token_str[i] = 0; + read_char(1); + peek_c = peek_char(1); + } while (next_char != '#' || (next_char == '#' && peek_c == 'd')); + skip_whitespace(); +} + +void ifdef_else_skip_lines() +{ + int i; + + do { + skip_whitespace(); + i = 0; + do { + token_str[i++] = next_char; + } while (read_char(0) != '\n'); + token_str[i] = 0; + } while (strcmp(token_str, "#else") && strcmp(token_str, "#endif")); + skip_whitespace(); +} + +/* check alias defined or not */ +void chk_def(int defined) +{ + char *alias = NULL; + char alias_name[MAX_TOKEN_LEN]; + + if (defined) { + read_alias_name_from_defined(alias_name, token_str); + alias = find_alias(alias_name); + } else + alias = find_alias(token_str); + + if (alias) + preproc_match = 1; +} + +token_t get_next_token() +{ + token_str[0] = 0; + + /* partial preprocessor */ + if (next_char == '#') { + int i = 0; + + do { + token_str[i++] = next_char; + } while (is_alnum(read_char(0))); + token_str[i] = 0; + skip_whitespace(); + + if (!strcmp(token_str, "#include")) { + do { + token_str[i++] = next_char; + } while (read_char(0) != '\n'); + skip_whitespace(); + return T_include; + } + if (!strcmp(token_str, "#define")) { + skip_whitespace(); + return T_define; + } + if (!strcmp(token_str, "#undef")) { + skip_whitespace(); + return T_undef; + } + if (!strcmp(token_str, "#error")) { + skip_whitespace(); + return T_error; + } + if (!strcmp(token_str, "#if")) { + preproc_match = 0; + i = 0; + do { + token_str[i++] = next_char; + } while (read_char(0) != '\n'); + token_str[i] = 0; + + if (!strncmp(token_str, "defined", 7)) { + chk_def(1); + if (preproc_match) { + skip_whitespace(); + return get_next_token(); + } + + /* skip lines until #elif or #else or #endif */ + if_elif_skip_lines(); + return get_next_token(); + } + } + if (!strcmp(token_str, "#elif")) { + if (preproc_match) { + do { + skip_whitespace(); + i = 0; + do { + token_str[i++] = next_char; + } while (read_char(0) != '\n'); + token_str[i] = 0; + } while (strcmp(token_str, "#endif")); + skip_whitespace(); + return get_next_token(); + } + + i = 0; + do { + token_str[i++] = next_char; + } while (read_char(0) != '\n'); + token_str[i] = 0; + + if (!strncmp(token_str, "defined", 7)) { + chk_def(1); + if (preproc_match) { + skip_whitespace(); + return get_next_token(); + } + /* skip lines until #elif or #else or #endif */ + if_elif_skip_lines(); + return get_next_token(); + } + } + if (!strcmp(token_str, "#ifdef")) { + preproc_match = 0; + i = 0; + do { + token_str[i++] = next_char; + } while (read_char(0) != '\n'); + token_str[i] = 0; + chk_def(0); + if (preproc_match) { + skip_whitespace(); + return get_next_token(); + } + /* skip lines until #else or #endif */ + ifdef_else_skip_lines(); + return get_next_token(); + } + if (!strcmp(token_str, "#else")) { + /* reach here has 2 possible cases: + * 1. reach #ifdef preprocessor directive + * 2. conditional expression in #elif is false + */ + if (!preproc_match) { + skip_whitespace(); + return get_next_token(); + } + /* skip lines until #else or #endif */ + ifdef_else_skip_lines(); + return get_next_token(); + } + if (!strcmp(token_str, "#endif")) { + preproc_match = 0; + skip_whitespace(); + return get_next_token(); + } + error("Unknown directive"); + } + + /* C-style comments */ + if (next_char == '/') { + read_char(0); + if (next_char == '*') { + /* in a comment, skip until end */ + do { + read_char(0); + if (next_char == '*') { + read_char(0); + if (next_char == '/') { + read_char(1); + return get_next_token(); + } + } + } while (next_char); + } else { + /* single '/', predict divide */ + if (next_char == ' ') + read_char(1); + return T_divide; + } + /* TODO: check invalid cases */ + error("Unexpected '/'"); + } + + if (is_digit(next_char)) { + int i = 0; + do { + token_str[i++] = next_char; + } while (is_hex(read_char(0))); + token_str[i] = 0; + skip_whitespace(); + return T_numeric; + } + if (next_char == '(') { + read_char(1); + return T_open_bracket; + } + if (next_char == ')') { + read_char(1); + return T_close_bracket; + } + if (next_char == '{') { + read_char(1); + return T_open_curly; + } + if (next_char == '}') { + read_char(1); + return T_close_curly; + } + if (next_char == '[') { + read_char(1); + return T_open_square; + } + if (next_char == ']') { + read_char(1); + return T_close_square; + } + if (next_char == ',') { + read_char(1); + return T_comma; + } + if (next_char == '^') { + read_char(1); + return T_bit_xor; + } + if (next_char == '~') { + read_char(1); + return T_bit_not; + } + if (next_char == '"') { + int i = 0; + int special = 0; + + while ((read_char(0) != '"') || special) { + if ((i > 0) && (token_str[i - 1] == '\\')) { + if (next_char == 'n') + token_str[i - 1] = '\n'; + else if (next_char == '"') + token_str[i - 1] = '"'; + else if (next_char == 'r') + token_str[i - 1] = '\r'; + else if (next_char == '\'') + token_str[i - 1] = '\''; + else if (next_char == 't') + token_str[i - 1] = '\t'; + else if (next_char == '\\') + token_str[i - 1] = '\\'; + else + abort(); + } else { + token_str[i++] = next_char; + } + if (next_char == '\\') + special = 1; + else + special = 0; + } + token_str[i] = 0; + read_char(1); + return T_string; + } + if (next_char == '\'') { + read_char(0); + if (next_char == '\\') { + read_char(0); + if (next_char == 'n') + token_str[0] = '\n'; + else if (next_char == 'r') + token_str[0] = '\r'; + else if (next_char == '\'') + token_str[0] = '\''; + else if (next_char == '"') + token_str[0] = '"'; + else if (next_char == 't') + token_str[0] = '\t'; + else if (next_char == '\\') + token_str[0] = '\\'; + else + abort(); + } else { + token_str[0] = next_char; + } + token_str[1] = 0; + if (read_char(0) != '\'') + abort(); + read_char(1); + return T_char; + } + if (next_char == '*') { + read_char(1); + return T_asterisk; + } + if (next_char == '&') { + read_char(0); + if (next_char == '&') { + read_char(1); + return T_log_and; + }; + if (next_char == '=') { + read_char(1); + return T_andeq; + } + skip_whitespace(); + return T_ampersand; + } + if (next_char == '|') { + read_char(0); + if (next_char == '|') { + read_char(1); + return T_log_or; + }; + if (next_char == '=') { + read_char(1); + return T_oreq; + } + skip_whitespace(); + return T_bit_or; + } + if (next_char == '<') { + read_char(0); + if (next_char == '=') { + read_char(1); + return T_le; + }; + if (next_char == '<') { + read_char(1); + return T_lshift; + }; + skip_whitespace(); + return T_lt; + } + if (next_char == '%') { + read_char(1); + return T_mod; + } + if (next_char == '>') { + read_char(0); + if (next_char == '=') { + read_char(1); + return T_ge; + }; + if (next_char == '>') { + read_char(1); + return T_rshift; + }; + skip_whitespace(); + return T_gt; + } + if (next_char == '!') { + read_char(0); + if (next_char == '=') { + read_char(1); + return T_noteq; + } + skip_whitespace(); + return T_log_not; + } + if (next_char == '.') { + read_char(0); + if (next_char == '.') { + read_char(0); + if (next_char == '.') { + read_char(1); + return T_elipsis; + } + abort(); + } + skip_whitespace(); + return T_dot; + } + if (next_char == '-') { + read_char(0); + if (next_char == '>') { + read_char(1); + return T_arrow; + } + if (next_char == '-') { + read_char(1); + return T_decrement; + } + if (next_char == '=') { + read_char(1); + return T_minuseq; + } + skip_whitespace(); + return T_minus; + } + if (next_char == '+') { + read_char(0); + if (next_char == '+') { + read_char(1); + return T_increment; + } + if (next_char == '=') { + read_char(1); + return T_pluseq; + } + skip_whitespace(); + return T_plus; + } + if (next_char == ';') { + read_char(1); + return T_semicolon; + } + if (next_char == '?') { + read_char(1); + return T_question; + } + if (next_char == ':') { + read_char(1); + return T_colon; + } + if (next_char == '=') { + read_char(0); + if (next_char == '=') { + read_char(1); + return T_eq; + } + skip_whitespace(); + return T_assign; + } + + /* end of file */ + /* "FIXME: The signedness of 'char' in the C programming language is indeed + * implementation-specific. For example, gcc for Arm treats 'char' as + * unsigned, while gcc for x86(-64) treats 'char' as signed. The warning + * below is raised in gcc for Arm: + * warning: comparison is always false due to limited range of data type + * [-Wtype-limits] + */ + if ((next_char == 0) || (next_char == -1)) + return T_eof; + + if (is_alnum(next_char)) { + char *alias; + int i = 0; + do { + token_str[i++] = next_char; + } while (is_alnum(read_char(0))); + token_str[i] = 0; + skip_whitespace(); + + if (!strcmp(token_str, "if")) + return T_if; + if (!strcmp(token_str, "while")) + return T_while; + if (!strcmp(token_str, "for")) + return T_for; + if (!strcmp(token_str, "do")) + return T_do; + if (!strcmp(token_str, "else")) + return T_else; + if (!strcmp(token_str, "return")) + return T_return; + if (!strcmp(token_str, "typedef")) + return T_typedef; + if (!strcmp(token_str, "enum")) + return T_enum; + if (!strcmp(token_str, "struct")) + return T_struct; + if (!strcmp(token_str, "sizeof")) + return T_sizeof; + if (!strcmp(token_str, "switch")) + return T_switch; + if (!strcmp(token_str, "case")) + return T_case; + if (!strcmp(token_str, "break")) + return T_break; + if (!strcmp(token_str, "default")) + return T_default; + if (!strcmp(token_str, "continue")) + return T_continue; + + if (preproc_aliasing) { + alias = find_alias(token_str); + if (alias) { + token_t t = is_numeric(alias) ? T_numeric : T_string; + strcpy(token_str, alias); + return t; + } + } + + return T_identifier; + } + + /* + * This only happens when parsing a macro. Move to the token after the + * macro definition or return to where the macro has been called. + */ + if (next_char == '\n') { + if (macro_return_idx) { + source_idx = macro_return_idx; + next_char = SOURCE[source_idx]; + } else + next_char = read_char(1); + return get_next_token(); + } + + error("Unrecognized input"); + return T_eof; +} + +/* Skip the content. We only need the index where the macro body begins. */ +void skip_macro_body() +{ + while (!is_newline(next_char)) + next_token = get_next_token(); + + skip_newline = 1; + next_token = get_next_token(); +} + +int lex_accept(token_t token) +{ + if (next_token == token) { + next_token = get_next_token(); + return 1; + } + return 0; +} + +int lex_peek(token_t token, char *value) +{ + if (next_token == token) { + if (!value) + return 1; + strcpy(value, token_str); + return 1; + } + return 0; +} + +void lex_ident(token_t token, char *value) +{ + if (next_token != token) + error("Unexpected token"); + strcpy(value, token_str); + next_token = get_next_token(); +} + +void lex_expect(token_t token) +{ + if (next_token != token) + error("Unexpected token"); + next_token = get_next_token(); +} diff --git a/src/main.c b/src/main.c index c3c4324f..16d64354 100644 --- a/src/main.c +++ b/src/main.c @@ -21,8 +21,11 @@ /* ELF manipulation */ #include "elf.c" -/* C language front-end */ -#include "cfront.c" +/* C language lexical analyzer */ +#include "lexer.c" + +/* C language syntactic analyzer */ +#include "parser.c" /* architecture-independent middle-end */ #include "ssa.c" diff --git a/src/cfront.c b/src/parser.c similarity index 81% rename from src/cfront.c rename to src/parser.c index 29b40c91..69514251 100644 --- a/src/cfront.c +++ b/src/parser.c @@ -5,148 +5,7 @@ * file "LICENSE" for information on usage and redistribution of this file. */ -/* C language front-end */ - -int is_whitespace(char c) -{ - return (c == ' ' || c == '\t'); -} - -char peek_char(int offset); - -/* is it backslash-newline? */ -int is_linebreak(char c) -{ - return c == '\\' && peek_char(1) == '\n'; -} - -int is_newline(char c) -{ - return (c == '\r' || c == '\n'); -} - -/* is it alphabet, number or '_'? */ -int is_alnum(char c) -{ - return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || (c == '_')); -} - -int is_digit(char c) -{ - return (c >= '0' && c <= '9') ? 1 : 0; -} - -int is_hex(char c) -{ - return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || c == 'x' || - (c >= 'A' && c <= 'F')); -} - -int is_numeric(char buffer[]) -{ - int i, hex, size = strlen(buffer); - - if (size > 2) - hex = (buffer[0] == '0' && buffer[1] == 'x') ? 1 : 0; - else - hex = 0; - - for (i = 0; i < size; i++) { - if (hex && (is_hex(buffer[i]) == 0)) - return 0; - if (!hex && (is_digit(buffer[i]) == 0)) - return 0; - } - return 1; -} - -/* lexer tokens */ -typedef enum { - T_start, /* FIXME: it was intended to start the state machine. */ - T_numeric, - T_identifier, - T_comma, /* , */ - T_string, /* null-terminated string */ - T_char, - T_open_bracket, /* ( */ - T_close_bracket, /* ) */ - T_open_curly, /* { */ - T_close_curly, /* } */ - T_open_square, /* [ */ - T_close_square, /* ] */ - T_asterisk, /* '*' */ - T_divide, /* / */ - T_mod, /* % */ - T_bit_or, /* | */ - T_bit_xor, /* ^ */ - T_bit_not, /* ~ */ - T_log_and, /* && */ - T_log_or, /* || */ - T_log_not, /* ! */ - T_lt, /* < */ - T_gt, /* > */ - T_le, /* <= */ - T_ge, /* >= */ - T_lshift, /* << */ - T_rshift, /* >> */ - T_dot, /* . */ - T_arrow, /* -> */ - T_plus, /* + */ - T_minus, /* - */ - T_minuseq, /* -= */ - T_pluseq, /* += */ - T_oreq, /* |= */ - T_andeq, /* &= */ - T_eq, /* == */ - T_noteq, /* != */ - T_assign, /* = */ - T_increment, /* ++ */ - T_decrement, /* -- */ - T_question, /* ? */ - T_colon, /* : */ - T_semicolon, /* ; */ - T_eof, /* end-of-file (EOF) */ - T_ampersand, /* & */ - T_return, - T_if, - T_else, - T_while, - T_for, - T_do, - T_define, - T_undef, - T_error, - T_include, - T_typedef, - T_enum, - T_struct, - T_sizeof, - T_elipsis, /* ... */ - T_switch, - T_case, - T_break, - T_default, - T_continue -} token_t; - -char token_str[MAX_TOKEN_LEN]; -token_t next_token; -char next_char; -int skip_newline = 1; - -int preproc_match; - -/* Allow replacing identifiers with alias value if alias exists. This is - * disabled in certain cases, e.g. #undef. - */ -int preproc_aliasing = 1; - -/* Point to the first character after where the macro has been called. It is - * needed when returning from the macro body. - */ -int macro_return_idx; - +/* C language syntactic analyzer */ int global_var_idx = 0; int global_label_idx = 0; char global_str_buf[MAX_VAR_LEN]; @@ -184,610 +43,6 @@ var_t *opstack_pop() return operand_stack[--operand_stack_idx]; } -void skip_whitespace() -{ - while (1) { - if (is_linebreak(next_char)) { - source_idx += 2; - next_char = SOURCE[source_idx]; - continue; - } - if (is_whitespace(next_char) || - (skip_newline && is_newline(next_char))) { - next_char = SOURCE[++source_idx]; - continue; - } - break; - } -} - -char read_char(int is_skip_space) -{ - next_char = SOURCE[++source_idx]; - if (is_skip_space == 1) - skip_whitespace(); - return next_char; -} - -/* get alias name from defined() directive - * i.e., get __arm__ from defined(__arm__) - */ -void read_alias_name_from_defined(char *alias_name, char *src) -{ - int i; - - src = src + 8; /* skip defined( */ - i = 0; - while (src[i] != ')') { - alias_name[i] = src[i]; - i++; - } - alias_name[i] = 0; -} - -char peek_char(int offset) -{ - return SOURCE[source_idx + offset]; -} - -void if_elif_skip_lines() -{ - char peek_c; - int i; - - do { - skip_whitespace(); - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - read_char(1); - peek_c = peek_char(1); - } while (next_char != '#' || (next_char == '#' && peek_c == 'd')); - skip_whitespace(); -} - -void ifdef_else_skip_lines() -{ - int i; - - do { - skip_whitespace(); - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - } while (strcmp(token_str, "#else") && strcmp(token_str, "#endif")); - skip_whitespace(); -} - -/* check alias defined or not */ -void chk_def(int defined) -{ - char *alias = NULL; - char alias_name[MAX_TOKEN_LEN]; - - if (defined) { - read_alias_name_from_defined(alias_name, token_str); - alias = find_alias(alias_name); - } else - alias = find_alias(token_str); - - if (alias) - preproc_match = 1; -} - -token_t get_next_token() -{ - token_str[0] = 0; - - /* partial preprocessor */ - if (next_char == '#') { - int i = 0; - - do { - token_str[i++] = next_char; - } while (is_alnum(read_char(0))); - token_str[i] = 0; - skip_whitespace(); - - if (!strcmp(token_str, "#include")) { - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - skip_whitespace(); - return T_include; - } - if (!strcmp(token_str, "#define")) { - skip_whitespace(); - return T_define; - } - if (!strcmp(token_str, "#undef")) { - skip_whitespace(); - return T_undef; - } - if (!strcmp(token_str, "#error")) { - skip_whitespace(); - return T_error; - } - if (!strcmp(token_str, "#if")) { - preproc_match = 0; - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - - if (!strncmp(token_str, "defined", 7)) { - chk_def(1); - if (preproc_match) { - skip_whitespace(); - return get_next_token(); - } - - /* skip lines until #elif or #else or #endif */ - if_elif_skip_lines(); - return get_next_token(); - } - } - if (!strcmp(token_str, "#elif")) { - if (preproc_match) { - do { - skip_whitespace(); - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - } while (strcmp(token_str, "#endif")); - skip_whitespace(); - return get_next_token(); - } - - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - - if (!strncmp(token_str, "defined", 7)) { - chk_def(1); - if (preproc_match) { - skip_whitespace(); - return get_next_token(); - } - /* skip lines until #elif or #else or #endif */ - if_elif_skip_lines(); - return get_next_token(); - } - } - if (!strcmp(token_str, "#ifdef")) { - preproc_match = 0; - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - chk_def(0); - if (preproc_match) { - skip_whitespace(); - return get_next_token(); - } - /* skip lines until #else or #endif */ - ifdef_else_skip_lines(); - return get_next_token(); - } - if (!strcmp(token_str, "#else")) { - /* reach here has 2 possible cases: - * 1. reach #ifdef preprocessor directive - * 2. conditional expression in #elif is false - */ - if (!preproc_match) { - skip_whitespace(); - return get_next_token(); - } - /* skip lines until #else or #endif */ - ifdef_else_skip_lines(); - return get_next_token(); - } - if (!strcmp(token_str, "#endif")) { - preproc_match = 0; - skip_whitespace(); - return get_next_token(); - } - error("Unknown directive"); - } - - /* C-style comments */ - if (next_char == '/') { - read_char(0); - if (next_char == '*') { - /* in a comment, skip until end */ - do { - read_char(0); - if (next_char == '*') { - read_char(0); - if (next_char == '/') { - read_char(1); - return get_next_token(); - } - } - } while (next_char); - } else { - /* single '/', predict divide */ - if (next_char == ' ') - read_char(1); - return T_divide; - } - /* TODO: check invalid cases */ - error("Unexpected '/'"); - } - - if (is_digit(next_char)) { - int i = 0; - do { - token_str[i++] = next_char; - } while (is_hex(read_char(0))); - token_str[i] = 0; - skip_whitespace(); - return T_numeric; - } - if (next_char == '(') { - read_char(1); - return T_open_bracket; - } - if (next_char == ')') { - read_char(1); - return T_close_bracket; - } - if (next_char == '{') { - read_char(1); - return T_open_curly; - } - if (next_char == '}') { - read_char(1); - return T_close_curly; - } - if (next_char == '[') { - read_char(1); - return T_open_square; - } - if (next_char == ']') { - read_char(1); - return T_close_square; - } - if (next_char == ',') { - read_char(1); - return T_comma; - } - if (next_char == '^') { - read_char(1); - return T_bit_xor; - } - if (next_char == '~') { - read_char(1); - return T_bit_not; - } - if (next_char == '"') { - int i = 0; - int special = 0; - - while ((read_char(0) != '"') || special) { - if ((i > 0) && (token_str[i - 1] == '\\')) { - if (next_char == 'n') - token_str[i - 1] = '\n'; - else if (next_char == '"') - token_str[i - 1] = '"'; - else if (next_char == 'r') - token_str[i - 1] = '\r'; - else if (next_char == '\'') - token_str[i - 1] = '\''; - else if (next_char == 't') - token_str[i - 1] = '\t'; - else if (next_char == '\\') - token_str[i - 1] = '\\'; - else - abort(); - } else { - token_str[i++] = next_char; - } - if (next_char == '\\') - special = 1; - else - special = 0; - } - token_str[i] = 0; - read_char(1); - return T_string; - } - if (next_char == '\'') { - read_char(0); - if (next_char == '\\') { - read_char(0); - if (next_char == 'n') - token_str[0] = '\n'; - else if (next_char == 'r') - token_str[0] = '\r'; - else if (next_char == '\'') - token_str[0] = '\''; - else if (next_char == '"') - token_str[0] = '"'; - else if (next_char == 't') - token_str[0] = '\t'; - else if (next_char == '\\') - token_str[0] = '\\'; - else - abort(); - } else { - token_str[0] = next_char; - } - token_str[1] = 0; - if (read_char(0) != '\'') - abort(); - read_char(1); - return T_char; - } - if (next_char == '*') { - read_char(1); - return T_asterisk; - } - if (next_char == '&') { - read_char(0); - if (next_char == '&') { - read_char(1); - return T_log_and; - }; - if (next_char == '=') { - read_char(1); - return T_andeq; - } - skip_whitespace(); - return T_ampersand; - } - if (next_char == '|') { - read_char(0); - if (next_char == '|') { - read_char(1); - return T_log_or; - }; - if (next_char == '=') { - read_char(1); - return T_oreq; - } - skip_whitespace(); - return T_bit_or; - } - if (next_char == '<') { - read_char(0); - if (next_char == '=') { - read_char(1); - return T_le; - }; - if (next_char == '<') { - read_char(1); - return T_lshift; - }; - skip_whitespace(); - return T_lt; - } - if (next_char == '%') { - read_char(1); - return T_mod; - } - if (next_char == '>') { - read_char(0); - if (next_char == '=') { - read_char(1); - return T_ge; - }; - if (next_char == '>') { - read_char(1); - return T_rshift; - }; - skip_whitespace(); - return T_gt; - } - if (next_char == '!') { - read_char(0); - if (next_char == '=') { - read_char(1); - return T_noteq; - } - skip_whitespace(); - return T_log_not; - } - if (next_char == '.') { - read_char(0); - if (next_char == '.') { - read_char(0); - if (next_char == '.') { - read_char(1); - return T_elipsis; - } - abort(); - } - skip_whitespace(); - return T_dot; - } - if (next_char == '-') { - read_char(0); - if (next_char == '>') { - read_char(1); - return T_arrow; - } - if (next_char == '-') { - read_char(1); - return T_decrement; - } - if (next_char == '=') { - read_char(1); - return T_minuseq; - } - skip_whitespace(); - return T_minus; - } - if (next_char == '+') { - read_char(0); - if (next_char == '+') { - read_char(1); - return T_increment; - } - if (next_char == '=') { - read_char(1); - return T_pluseq; - } - skip_whitespace(); - return T_plus; - } - if (next_char == ';') { - read_char(1); - return T_semicolon; - } - if (next_char == '?') { - read_char(1); - return T_question; - } - if (next_char == ':') { - read_char(1); - return T_colon; - } - if (next_char == '=') { - read_char(0); - if (next_char == '=') { - read_char(1); - return T_eq; - } - skip_whitespace(); - return T_assign; - } - - /* end of file */ - /* "FIXME: The signedness of 'char' in the C programming language is indeed - * implementation-specific. For example, gcc for Arm treats 'char' as - * unsigned, while gcc for x86(-64) treats 'char' as signed. The warning - * below is raised in gcc for Arm: - * warning: comparison is always false due to limited range of data type - * [-Wtype-limits] - */ - if ((next_char == 0) || (next_char == -1)) - return T_eof; - - if (is_alnum(next_char)) { - char *alias; - int i = 0; - do { - token_str[i++] = next_char; - } while (is_alnum(read_char(0))); - token_str[i] = 0; - skip_whitespace(); - - if (!strcmp(token_str, "if")) - return T_if; - if (!strcmp(token_str, "while")) - return T_while; - if (!strcmp(token_str, "for")) - return T_for; - if (!strcmp(token_str, "do")) - return T_do; - if (!strcmp(token_str, "else")) - return T_else; - if (!strcmp(token_str, "return")) - return T_return; - if (!strcmp(token_str, "typedef")) - return T_typedef; - if (!strcmp(token_str, "enum")) - return T_enum; - if (!strcmp(token_str, "struct")) - return T_struct; - if (!strcmp(token_str, "sizeof")) - return T_sizeof; - if (!strcmp(token_str, "switch")) - return T_switch; - if (!strcmp(token_str, "case")) - return T_case; - if (!strcmp(token_str, "break")) - return T_break; - if (!strcmp(token_str, "default")) - return T_default; - if (!strcmp(token_str, "continue")) - return T_continue; - - if (preproc_aliasing) { - alias = find_alias(token_str); - if (alias) { - token_t t = is_numeric(alias) ? T_numeric : T_string; - strcpy(token_str, alias); - return t; - } - } - - return T_identifier; - } - - /* - * This only happens when parsing a macro. Move to the token after the - * macro definition or return to where the macro has been called. - */ - if (next_char == '\n') { - if (macro_return_idx) { - source_idx = macro_return_idx; - next_char = SOURCE[source_idx]; - } else - next_char = read_char(1); - return get_next_token(); - } - - error("Unrecognized input"); - return T_eof; -} - -/* Skip the content. We only need the index where the macro body begins. */ -void skip_macro_body() -{ - while (!is_newline(next_char)) - next_token = get_next_token(); - - skip_newline = 1; - next_token = get_next_token(); -} - -int lex_accept(token_t token) -{ - if (next_token == token) { - next_token = get_next_token(); - return 1; - } - return 0; -} - -int lex_peek(token_t token, char *value) -{ - if (next_token == token) { - if (!value) - return 1; - strcpy(value, token_str); - return 1; - } - return 0; -} - -void lex_ident(token_t token, char *value) -{ - if (next_token != token) - error("Unexpected token"); - strcpy(value, token_str); - next_token = get_next_token(); -} - -void lex_expect(token_t token) -{ - if (next_token != token) - error("Unexpected token"); - next_token = get_next_token(); -} - void read_expr(block_t *parent, basic_block_t **bb); int write_symbol(char *data, int len) From fa9afabee905aaa280bc13c93ca2950cdfa12333 Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Fri, 15 Dec 2023 10:16:47 +0800 Subject: [PATCH 2/2] Refactor redundant assignment --- src/lexer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index 9182045a..389d11d3 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -122,12 +122,10 @@ int is_hex(char c) int is_numeric(char buffer[]) { - int i, hex, size = strlen(buffer); + int i, hex = 0, size = strlen(buffer); if (size > 2) hex = (buffer[0] == '0' && buffer[1] == 'x') ? 1 : 0; - else - hex = 0; for (i = 0; i < size; i++) { if (hex && (is_hex(buffer[i]) == 0))