From mboxrd@z Thu Jan 1 00:00:00 1970 Authentication-Results: mail-a.sr.ht; dkim=pass header.d=johnnyrichard.com header.i=@johnnyrichard.com Received: from out-181.mta0.migadu.com (out-181.mta0.migadu.com [91.218.175.181]) by mail-a.sr.ht (Postfix) with ESMTPS id 40E4E2008A for <~johnnyrichard/olang-devel@lists.sr.ht>; Mon, 19 Feb 2024 00:20:11 +0000 (UTC) X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=johnnyrichard.com; s=key1; t=1708302010; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=oUvAz3xS1bP3gX+Ee6pcgyJxQ0SEvuaCDVKGJt+CZqQ=; b=t+SDV6EugyOG1+mw13ZnuD2d2ssS+pT7T0gZbRA6ljILXHHFZwimwAzuwpkt9bLFSvcdrd wlEtzlRNP9HMqbzvUi+aS+4WwqWYaDpNa2PUWT6htQP8+C5O5D8vglR/conya6sxo0BnR0 zcttKQ4i2h8mPnHiIfBzrIahNKD38mnU9Tg8sWNE9Vn8Po1r6aYvZiVfBSlF0njIlOsKCF gxAlEZiOQ6Kf3fp5eyImdK/Wy+fl7N+S2wQLUc6miybeZLu1hSNBeJz5WtqTUO4/11gG5/ wUhXT5KLqBDqifEJpoKWMifz5JmdRb6CAhGStdZCSrCROOCLp0wcaAvhJTA/6Q== From: Johnny Richard To: ~johnnyrichard/olang-devel@lists.sr.ht Cc: Johnny Richard Subject: [PATCH olang 2/2] lexer: create --dump-tokens cli command Date: Mon, 19 Feb 2024 02:15:39 +0100 Message-ID: <20240219011835.14769-3-johnny@johnnyrichard.com> In-Reply-To: <20240219011835.14769-1-johnny@johnnyrichard.com> References: <20240219011835.14769-1-johnny@johnnyrichard.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Migadu-Flow: FLOW_OUT X-TUID: dEND2b52MeRt This patch introduces the dump tokens interface and creates the initial setup for lexical analysis.
Signed-off-by: Johnny Richard
---
 examples/main_exit.0 |   3 +
 src/0c.c             | 142 ++++++++++++++++++++++-
 src/lexer.c          | 211 +++++++++++++++++++++++++++++++++++++++++++
 src/lexer.h          |  78 ++++++++++++++
 4 files changed, 432 insertions(+), 2 deletions(-)
 create mode 100644 examples/main_exit.0
 create mode 100644 src/lexer.c
 create mode 100644 src/lexer.h

diff --git a/examples/main_exit.0 b/examples/main_exit.0
new file mode 100644
index 0000000..c86fc68
--- /dev/null
+++ b/examples/main_exit.0
@@ -0,0 +1,3 @@
+fn main(): u32 {
+  return 0
+}
diff --git a/src/0c.c b/src/0c.c
index 33ac945..e5199a7 100644
--- a/src/0c.c
+++ b/src/0c.c
@@ -14,8 +14,146 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lexer.h"
+#include "string_view.h"
+
+typedef struct cli_args
+{
+    int argc;
+    char **argv;
+} cli_args_t;
+
+// Pops the next argument; returns NULL once every argument was consumed.
+char *
+cli_args_shift(cli_args_t *args);
+
+typedef struct cli_opts
+{
+    // TODO: create man page instruction for --dump-tokens option
+    bool dump_tokens;
+    char *file_path;
+} cli_opts_t;
+
+// Writes the command-line synopsis to stream.
+void
+print_usage(FILE *stream, char *prog);
+
+// Loads file_path into a malloc'ed buffer owned by the caller (not
+// NUL-terminated); prints a diagnostic and exits on any failure.
+string_view_t
+read_entire_file(char *file_path);
+
 int
-main(void)
+main(int argc, char **argv)
+{
+    cli_args_t args = { .argc = argc, .argv = argv };
+    cli_opts_t opts = { 0 };
+
+    char *prog = cli_args_shift(&args);
+
+    if (argc != 3) {
+        print_usage(stderr, prog);
+        return EXIT_FAILURE;
+    }
+
+    for (char *arg = cli_args_shift(&args); arg != NULL; arg = cli_args_shift(&args)) {
+        if (strcmp(arg, "--dump-tokens") == 0) {
+            opts.dump_tokens = true;
+        } else {
+            opts.file_path = arg;
+        }
+    }
+
+    // Passing the flag twice satisfies argc == 3 but leaves file_path NULL,
+    // which must never reach fopen().
+    if (!opts.dump_tokens || opts.file_path == NULL) {
+        print_usage(stderr, prog);
+        return EXIT_FAILURE;
+    }
+
+    string_view_t file_content = read_entire_file(opts.file_path);
+
+    // TODO: missing integration test for lexer tokenizing
+    lexer_t lexer = { 0 };
+    lexer_init(&lexer, file_content);
+
+    token_t token = { 0 };
+    lexer_next_token(&lexer, &token);
+    while (token.kind != TOKEN_EOF) {
+        // size_t is printed with %zu; %lu is only correct where long is as wide.
+        printf("%s:%zu:%zu: <%s>\n",
+               opts.file_path,
+               token.location.row + 1,
+               (token.location.offset - token.location.bol) + 1,
+               token_kind_to_cstr(token.kind));
+        lexer_next_token(&lexer, &token);
+    }
+
+    free(file_content.chars);
+
+    return EXIT_SUCCESS;
+}
+
+char *
+cli_args_shift(cli_args_t *args)
+{
+    if (args->argc <= 0) {
+        return NULL;
+    }
+    --(args->argc);
+    return *(args->argv)++;
+}
+
+void
+print_usage(FILE *stream, char *prog)
+{
+    fprintf(stream, "usage: %s <source.0> --dump-tokens\n", prog);
+}
+
+string_view_t
+read_entire_file(char *file_path)
 {
-    return 0;
+    FILE *stream = fopen(file_path, "rb");
+
+    if (stream == NULL) {
+        fprintf(stderr, "Could not open file %s: %s\n", file_path, strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    string_view_t file_content = { 0 };
+
+    // ftell() reports failure as -1; that value must not be stored in the
+    // unsigned size field, so it is validated in a signed local first.
+    long file_size = -1;
+    if (fseek(stream, 0, SEEK_END) == 0) {
+        file_size = ftell(stream);
+    }
+    if (file_size < 0 || fseek(stream, 0, SEEK_SET) != 0) {
+        fprintf(stderr, "Could not read file %s: %s\n", file_path, strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+    file_content.size = (size_t)file_size;
+
+    // malloc(0) may return NULL, so always request at least one byte to keep
+    // an empty source file distinguishable from an allocation failure.
+    file_content.chars = (char *)malloc(file_content.size > 0 ? file_content.size : 1);
+
+    if (file_content.chars == NULL) {
+        fprintf(stderr, "Could not read file %s: %s\n", file_path, strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    if (fread(file_content.chars, 1, file_content.size, stream) != file_content.size) {
+        fprintf(stderr, "Could not read file %s: %s\n", file_path, ferror(stream) ? strerror(errno) : "short read");
+        exit(EXIT_FAILURE);
+    }
+    fclose(stream);
+
+    return file_content;
 }
diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..7866a9a
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (C) 2024 olang maintainers
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+#include "lexer.h"
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdbool.h>
+
+void
+lexer_init(lexer_t *lexer, string_view_t source)
+{
+    assert(lexer);
+    lexer->source = source;
+    lexer->offset = 0;
+    lexer->row = 0;
+    lexer->bol = 0;
+}
+
+static char
+lexer_next_char(lexer_t *lexer);
+
+static void
+lexer_skip_char(lexer_t *lexer);
+
+static bool
+lexer_is_eof(lexer_t *lexer);
+
+static bool
+lexer_is_not_eof(lexer_t *lexer);
+
+static bool
+is_blank_char(char c);
+
+static void
+lexer_init_char_token(lexer_t *lexer, token_t *token, token_kind_t kind);
+
+static void
+lexer_init_str_token(lexer_t *lexer, token_t *token, token_kind_t kind, size_t start_offset);
+
+static token_kind_t
+lexer_str_to_token_kind(string_view_t text);
+
+static token_kind_t
+lexer_char_to_token_kind(char c);
+
+void
+lexer_next_token(lexer_t *lexer, token_t *token)
+{
+    // Blanks never produce a token ('\n' is not a blank: it yields TOKEN_LF).
+    // EOF is tested before every read so the source buffer, which is not
+    // NUL-terminated, is never accessed one byte past its end.
+    while (lexer_is_not_eof(lexer) && is_blank_char(lexer_next_char(lexer))) {
+        lexer_skip_char(lexer);
+    }
+
+    if (lexer_is_eof(lexer)) {
+        *token = (token_t){ .kind = TOKEN_EOF };
+        return;
+    }
+
+    char current_char = lexer_next_char(lexer);
+    size_t start_offset = lexer->offset;
+
+    // <ctype.h> classifiers need a value representable as unsigned char.
+    if (isalpha((unsigned char)current_char)) {
+        while (lexer_is_not_eof(lexer) && isalnum((unsigned char)lexer_next_char(lexer))) {
+            lexer_skip_char(lexer);
+        }
+
+        string_view_t text = { .chars = lexer->source.chars + start_offset, .size = lexer->offset - start_offset };
+
+        lexer_init_str_token(lexer, token, lexer_str_to_token_kind(text), start_offset);
+        return;
+    }
+
+    if (isdigit((unsigned char)current_char)) {
+        while (lexer_is_not_eof(lexer) && isdigit((unsigned char)lexer_next_char(lexer))) {
+            lexer_skip_char(lexer);
+        }
+
+        lexer_init_str_token(lexer, token, TOKEN_NUMBER, start_offset);
+        return;
+    }
+
+    // The token must be built before the skip: skipping '\n' moves row/bol.
+    lexer_init_char_token(lexer, token, lexer_char_to_token_kind(current_char));
+    lexer_skip_char(lexer);
+}
+
+static char *token_kind_str_table[] = {
+    [TOKEN_UNKNOWN] = "unknown", [TOKEN_IDENTIFIER] = "identifier",
+    [TOKEN_NUMBER] = "number",   [TOKEN_FN] = "fn",
+    [TOKEN_RETURN] = "return",   [TOKEN_LF] = "line_feed",
+    [TOKEN_OPAREN] = "(",        [TOKEN_CPAREN] = ")",
+    [TOKEN_COLON] = ":",         [TOKEN_OCURLY] = "{",
+    [TOKEN_CCURLY] = "}",        [TOKEN_EOF] = "EOF",
+};
+
+char *
+token_kind_to_cstr(token_kind_t kind)
+{
+    // sizeof on the array yields bytes; divide to get the element count.
+    assert((size_t)kind < sizeof(token_kind_str_table) / sizeof(token_kind_str_table[0]));
+    return token_kind_str_table[kind];
+}
+
+// Peeks the byte under the cursor without consuming it; not valid at EOF.
+static char
+lexer_next_char(lexer_t *lexer)
+{
+    assert(lexer->offset < lexer->source.size);
+    return lexer->source.chars[lexer->offset];
+}
+
+// Consumes one byte, updating row/bol when that byte is a line feed.
+static void
+lexer_skip_char(lexer_t *lexer)
+{
+    assert(lexer->offset < lexer->source.size);
+    if (lexer->source.chars[lexer->offset] == '\n') {
+        lexer->row++;
+        lexer->bol = ++lexer->offset;
+    } else {
+        lexer->offset++;
+    }
+}
+
+static bool
+lexer_is_eof(lexer_t *lexer)
+{
+    return lexer->offset >= lexer->source.size;
+}
+
+static bool
+lexer_is_not_eof(lexer_t *lexer)
+{
+    return !lexer_is_eof(lexer);
+}
+
+// Blanks separate tokens; '\n' is excluded because it is a token itself.
+static bool
+is_blank_char(char c)
+{
+    return c == ' ' || c == '\f' || c == '\r' || c == '\t' || c == '\v';
+}
+
+static void
+lexer_init_char_token(lexer_t *lexer, token_t *token, token_kind_t kind)
+{
+    string_view_t str = { .chars = lexer->source.chars + lexer->offset, .size = 1 };
+    token_loc_t location = { .offset = lexer->offset, .row = lexer->row, .bol = lexer->bol };
+    *token = (token_t){ .kind = kind, .value = str, .location = location };
+}
+
+static void
+lexer_init_str_token(lexer_t *lexer, token_t *token, token_kind_t kind, size_t start_offset)
+{
+    string_view_t str = { .chars = lexer->source.chars + start_offset, .size = lexer->offset - start_offset };
+    token_loc_t location = { .offset = start_offset, .row = lexer->row, .bol = lexer->bol };
+    *token = (token_t){ .kind = kind, .value = str, .location = location };
+}
+
+static token_kind_t
+lexer_str_to_token_kind(string_view_t text)
+{
+    if (string_view_eq_to_cstr(text, "return")) {
+        return TOKEN_RETURN;
+    }
+
+    if (string_view_eq_to_cstr(text, "fn")) {
+        return TOKEN_FN;
+    }
+
+    return TOKEN_IDENTIFIER;
+}
+
+// Maps a single byte to its token kind; unlisted bytes are TOKEN_UNKNOWN.
+static token_kind_t
+lexer_char_to_token_kind(char c)
+{
+    switch (c) {
+        case '(':
+            return TOKEN_OPAREN;
+        case ')':
+            return TOKEN_CPAREN;
+        case ':':
+            return TOKEN_COLON;
+        case '{':
+            return TOKEN_OCURLY;
+        case '}':
+            return TOKEN_CCURLY;
+        case '\n':
+            return TOKEN_LF;
+        default:
+            return TOKEN_UNKNOWN;
+    }
+}
diff --git a/src/lexer.h b/src/lexer.h
new file mode 100644
index 0000000..8c09e02
--- /dev/null
+++ b/src/lexer.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2024 olang maintainers
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+#ifndef LEXER_H
+#define LEXER_H
+
+#include "string_view.h"
+#include <stddef.h>
+
+typedef struct lexer
+{
+    string_view_t source; // text being tokenized
+    size_t offset;        // index of the next unread byte in source
+    size_t row;           // number of line feeds consumed so far
+    size_t bol;           // offset just past the last consumed line feed
+} lexer_t;
+
+typedef enum token_kind
+{
+    TOKEN_UNKNOWN,
+    TOKEN_IDENTIFIER,
+    TOKEN_NUMBER,
+
+    // Keywords
+    TOKEN_FN,
+    TOKEN_RETURN,
+
+    // Single char
+    TOKEN_LF,
+    TOKEN_OPAREN,
+    TOKEN_CPAREN,
+    TOKEN_COLON,
+    TOKEN_OCURLY,
+    TOKEN_CCURLY,
+    TOKEN_EOF
+} token_kind_t;
+
+typedef struct token_loc
+{
+    size_t offset; // index of the token's first byte in the source
+    size_t row;    // zero-based line of the token
+    size_t bol;    // offset of the first byte of that line
+} token_loc_t;
+
+typedef struct token
+{
+    token_kind_t kind;
+    string_view_t value;
+    token_loc_t location;
+} token_t;
+
+// Points lexer at the start of source; the lexer keeps source's chars pointer.
+void
+lexer_init(lexer_t *lexer, string_view_t source);
+
+// Stores the next token in *token; once the input is exhausted every call
+// yields TOKEN_EOF.
+void
+lexer_next_token(lexer_t *lexer, token_t *token);
+
+// Returns the printable name of kind (static storage, do not free).
+char *
+token_kind_to_cstr(token_kind_t kind);
+
+#endif /* LEXER_H */
-- 
2.43.2