From mboxrd@z Thu Jan  1 00:00:00 1970
Authentication-Results: mail-a.sr.ht; dkim=pass header.d=johnnyrichard.com header.i=@johnnyrichard.com
Received: from out-181.mta0.migadu.com (out-181.mta0.migadu.com [91.218.175.181])
	by mail-a.sr.ht (Postfix) with ESMTPS id 40E4E2008A
	for <~johnnyrichard/olang-devel@lists.sr.ht>; Mon, 19 Feb 2024 00:20:11 +0000 (UTC)
X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers.
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=johnnyrichard.com;
	s=key1; t=1708302010;
	h=from:from:reply-to:subject:subject:date:date:message-id:message-id:
	 to:to:cc:cc:mime-version:mime-version:
	 content-transfer-encoding:content-transfer-encoding:
	 in-reply-to:in-reply-to:references:references;
	bh=oUvAz3xS1bP3gX+Ee6pcgyJxQ0SEvuaCDVKGJt+CZqQ=;
	b=t+SDV6EugyOG1+mw13ZnuD2d2ssS+pT7T0gZbRA6ljILXHHFZwimwAzuwpkt9bLFSvcdrd
	wlEtzlRNP9HMqbzvUi+aS+4WwqWYaDpNa2PUWT6htQP8+C5O5D8vglR/conya6sxo0BnR0
	zcttKQ4i2h8mPnHiIfBzrIahNKD38mnU9Tg8sWNE9Vn8Po1r6aYvZiVfBSlF0njIlOsKCF
	gxAlEZiOQ6Kf3fp5eyImdK/Wy+fl7N+S2wQLUc6miybeZLu1hSNBeJz5WtqTUO4/11gG5/
	wUhXT5KLqBDqifEJpoKWMifz5JmdRb6CAhGStdZCSrCROOCLp0wcaAvhJTA/6Q==
From: Johnny Richard <johnny@johnnyrichard.com>
To: ~johnnyrichard/olang-devel@lists.sr.ht
Cc: Johnny Richard <johnny@johnnyrichard.com>
Subject: [PATCH olang 2/2] lexer: create --dump-tokens cli command
Date: Mon, 19 Feb 2024 02:15:39 +0100
Message-ID: <20240219011835.14769-3-johnny@johnnyrichard.com>
In-Reply-To: <20240219011835.14769-1-johnny@johnnyrichard.com>
References: <20240219011835.14769-1-johnny@johnnyrichard.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Migadu-Flow: FLOW_OUT
X-TUID: dEND2b52MeRt

This patch introduces the dump tokens interface and create the initial
setup for lexical analysis.

Signed-off-by: Johnny Richard <johnny@johnnyrichard.com>
---
 examples/main_exit.0 |   3 +
 src/0c.c             | 121 ++++++++++++++++++++++-
 src/lexer.c          | 224 +++++++++++++++++++++++++++++++++++++++++++
 src/lexer.h          |  74 ++++++++++++++
 4 files changed, 420 insertions(+), 2 deletions(-)
 create mode 100644 examples/main_exit.0
 create mode 100644 src/lexer.c
 create mode 100644 src/lexer.h

diff --git a/examples/main_exit.0 b/examples/main_exit.0
new file mode 100644
index 0000000..c86fc68
--- /dev/null
+++ b/examples/main_exit.0
@@ -0,0 +1,3 @@
+fn main(): u32 {
+  return 0
+}
diff --git a/src/0c.c b/src/0c.c
index 33ac945..e5199a7 100644
--- a/src/0c.c
+++ b/src/0c.c
@@ -14,8 +14,125 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lexer.h"
+#include "string_view.h"
+
+typedef struct cli_args
+{
+    int argc;
+    char **argv;
+} cli_args_t;
+
+char *
+cli_args_shift(cli_args_t *args);
+
+typedef struct cli_opts
+{
+    // TODO: create man page instruction for --dump-tokens option
+    bool dump_tokens;
+    char *file_path;
+} cli_opts_t;
+
+void
+print_usage(FILE *stream, char *prog);
+
+string_view_t
+read_entire_file(char *file_path);
+
 int
-main(void)
+main(int argc, char **argv)
+{
+    cli_args_t args = { .argc = argc, .argv = argv };
+    cli_opts_t opts = { 0 };
+
+    char *prog = cli_args_shift(&args);
+
+    if (argc != 3) {
+        print_usage(stderr, prog);
+        return EXIT_FAILURE;
+    }
+
+    for (char *arg = cli_args_shift(&args); arg != NULL; arg = cli_args_shift(&args)) {
+        if (strcmp(arg, "--dump-tokens") == 0) {
+            opts.dump_tokens = true;
+        } else {
+            opts.file_path = arg;
+        }
+    }
+
+    if (!opts.dump_tokens) {
+        print_usage(stderr, prog);
+        return EXIT_FAILURE;
+    }
+
+    string_view_t file_content = read_entire_file(opts.file_path);
+
+    // TODO: missing integration test for lexer tokenizing
+    lexer_t lexer = { 0 };
+    lexer_init(&lexer, file_content);
+
+    token_t token = { 0 };
+    lexer_next_token(&lexer, &token);
+    while (token.kind != TOKEN_EOF) {
+        printf("%s:%lu:%lu: <%s>\n",
+               opts.file_path,
+               token.location.row + 1,
+               (token.location.offset - token.location.bol) + 1,
+               token_kind_to_cstr(token.kind));
+        lexer_next_token(&lexer, &token);
+    }
+
+    free(file_content.chars);
+
+    return EXIT_SUCCESS;
+}
+
+char *
+cli_args_shift(cli_args_t *args)
+{
+    if (args->argc == 0)
+        return NULL;
+    --(args->argc);
+    return *(args->argv)++;
+}
+
+void
+print_usage(FILE *stream, char *prog)
+{
+    fprintf(stream, "usage: %s <source.0> --dump-tokens\n", prog);
+}
+
+string_view_t
+read_entire_file(char *file_path)
 {
-    return 0;
+    FILE *stream = fopen(file_path, "rb");
+
+    if (stream == NULL) {
+        fprintf(stderr, "Could not open file %s: %s\n", file_path, strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    string_view_t file_content = { 0 };
+
+    fseek(stream, 0, SEEK_END);
+    file_content.size = ftell(stream);
+    fseek(stream, 0, SEEK_SET);
+
+    file_content.chars = (char *)malloc(file_content.size);
+
+    if (file_content.chars == NULL) {
+        fprintf(stderr, "Could not read file %s: %s\n", file_path, strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    fread(file_content.chars, 1, file_content.size, stream);
+    fclose(stream);
+
+    return file_content;
 }
diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..7866a9a
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (C) 2024 olang maintainers
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+#include "lexer.h"
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdbool.h>
+
+void
+lexer_init(lexer_t *lexer, string_view_t source)
+{
+    assert(lexer);
+    lexer->source = source;
+    lexer->offset = 0;
+    lexer->row = 0;
+    lexer->bol = 0;
+}
+
+static char
+lexer_next_char(lexer_t *lexer);
+
+static void
+lexer_skip_char(lexer_t *lexer);
+
+static bool
+lexer_is_eof(lexer_t *lexer);
+
+static bool
+lexer_is_not_eof(lexer_t *lexer);
+
+static bool
+_isspace(char c);
+
+static void
+lexer_init_char_token(lexer_t *lexer, token_t *token, token_kind_t kind);
+
+static void
+lexer_init_str_token(lexer_t *lexer, token_t *token, token_kind_t kind, size_t start_offset);
+
+static token_kind_t
+lexer_str_to_token_kind(string_view_t text);
+
+void
+lexer_next_token(lexer_t *lexer, token_t *token)
+{
+    if (lexer_is_eof(lexer)) {
+        *token = (token_t){ .kind = TOKEN_EOF };
+        return;
+    }
+
+    char current_char = lexer_next_char(lexer);
+
+    if (_isspace(current_char)) {
+        while (_isspace(current_char) && lexer_is_not_eof(lexer)) {
+            lexer_skip_char(lexer);
+            current_char = lexer_next_char(lexer);
+        }
+    }
+
+    while (lexer_is_not_eof(lexer)) {
+        if (isalpha(current_char)) {
+            size_t start_offset = lexer->offset;
+            while (isalnum(current_char) && lexer_is_not_eof(lexer)) {
+                lexer_skip_char(lexer);
+                current_char = lexer_next_char(lexer);
+            }
+
+            string_view_t text = { .chars = lexer->source.chars + start_offset, .size = lexer->offset - start_offset };
+
+            lexer_init_str_token(lexer, token, lexer_str_to_token_kind(text), start_offset);
+            return;
+        }
+
+        if (isdigit(current_char)) {
+            size_t start_offset = lexer->offset;
+            while (isdigit(current_char) && lexer_is_not_eof(lexer)) {
+                lexer_skip_char(lexer);
+                current_char = lexer_next_char(lexer);
+            }
+
+            lexer_init_str_token(lexer, token, TOKEN_NUMBER, start_offset);
+            return;
+        }
+
+        switch (current_char) {
+            case '(': {
+                lexer_init_char_token(lexer, token, TOKEN_OPAREN);
+                lexer_skip_char(lexer);
+                return;
+            }
+            case ')': {
+                lexer_init_char_token(lexer, token, TOKEN_CPAREN);
+                lexer_skip_char(lexer);
+                return;
+            }
+            case ':': {
+                lexer_init_char_token(lexer, token, TOKEN_COLON);
+                lexer_skip_char(lexer);
+                return;
+            }
+            case '{': {
+                lexer_init_char_token(lexer, token, TOKEN_OCURLY);
+                lexer_skip_char(lexer);
+                return;
+            }
+            case '}': {
+                lexer_init_char_token(lexer, token, TOKEN_CCURLY);
+                lexer_skip_char(lexer);
+                return;
+            }
+            case '\n': {
+                lexer_init_char_token(lexer, token, TOKEN_LF);
+                lexer_skip_char(lexer);
+                return;
+            }
+            default: {
+                lexer_init_char_token(lexer, token, TOKEN_UNKNOWN);
+                lexer_skip_char(lexer);
+                return;
+            }
+        }
+    }
+
+    if (lexer_is_eof(lexer)) {
+        *token = (token_t){ .kind = TOKEN_EOF };
+        return;
+    }
+}
+
+static char *token_kind_str_table[] = {
+    [TOKEN_UNKNOWN] = "unknown", [TOKEN_IDENTIFIER] = "identifier",
+    [TOKEN_NUMBER] = "number",   [TOKEN_FN] = "fn",
+    [TOKEN_RETURN] = "return",   [TOKEN_LF] = "line_feed",
+    [TOKEN_OPAREN] = "(",        [TOKEN_CPAREN] = ")",
+    [TOKEN_COLON] = ":",         [TOKEN_OCURLY] = "{",
+    [TOKEN_CCURLY] = "}",        [TOKEN_EOF] = "EOF",
+};
+
+char *
+token_kind_to_cstr(token_kind_t kind)
+{
+    assert(kind < sizeof(token_kind_str_table));
+    return token_kind_str_table[kind];
+}
+
+static char
+lexer_next_char(lexer_t *lexer)
+{
+    return lexer->source.chars[lexer->offset];
+}
+
+static void
+lexer_skip_char(lexer_t *lexer)
+{
+    assert(lexer->offset < lexer->source.size);
+    if (lexer->source.chars[lexer->offset] == '\n') {
+        lexer->row++;
+        lexer->bol = ++lexer->offset;
+    } else {
+      lexer->offset++;
+    }
+}
+
+static bool
+lexer_is_eof(lexer_t *lexer)
+{
+    return lexer->offset >= lexer->source.size;
+}
+
+static bool
+lexer_is_not_eof(lexer_t *lexer)
+{
+    return !lexer_is_eof(lexer);
+}
+
+static bool
+_isspace(char c)
+{
+    return c == ' ' || c == '\f' || c == '\r' || c == '\t' || c == '\v';
+}
+
+static void
+lexer_init_char_token(lexer_t *lexer, token_t *token, token_kind_t kind)
+{
+    string_view_t str = { .chars = lexer->source.chars + lexer->offset, .size = 1 };
+    token_loc_t location = { .offset = lexer->offset, .row = lexer->row, .bol = lexer->bol };
+    *token = (token_t){ .kind = kind, .value = str, .location = location };
+}
+
+static void
+lexer_init_str_token(lexer_t *lexer, token_t *token, token_kind_t kind, size_t start_offset)
+{
+    string_view_t str = { .chars = lexer->source.chars + start_offset, .size = lexer->offset - start_offset };
+    token_loc_t location = { .offset = start_offset, .row = lexer->row, .bol = lexer->bol };
+    *token = (token_t){ .kind = kind, .value = str, .location = location };
+}
+
+static token_kind_t
+lexer_str_to_token_kind(string_view_t text)
+{
+    if (string_view_eq_to_cstr(text, "return")) {
+        return TOKEN_RETURN;
+    }
+
+    if (string_view_eq_to_cstr(text, "fn")) {
+        return TOKEN_FN;
+    }
+
+    return TOKEN_IDENTIFIER;
+}
diff --git a/src/lexer.h b/src/lexer.h
new file mode 100644
index 0000000..8c09e02
--- /dev/null
+++ b/src/lexer.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2024 olang maintainers
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+#ifndef LEXER_H
+#define LEXER_H
+
+#include "string_view.h"
+#include <stdint.h>
+
+typedef struct lexer
+{
+    string_view_t source;
+    size_t offset;
+    size_t row;
+    size_t bol;
+} lexer_t;
+
+typedef enum token_kind
+{
+    TOKEN_UNKNOWN,
+    TOKEN_IDENTIFIER,
+    TOKEN_NUMBER,
+
+    // Keywords
+    TOKEN_FN,
+    TOKEN_RETURN,
+
+    // Single char
+    TOKEN_LF,
+    TOKEN_OPAREN,
+    TOKEN_CPAREN,
+    TOKEN_COLON,
+    TOKEN_OCURLY,
+    TOKEN_CCURLY,
+    TOKEN_EOF
+} token_kind_t;
+
+typedef struct token_loc
+{
+    size_t offset;
+    size_t row;
+    size_t bol;
+} token_loc_t;
+
+typedef struct token
+{
+    token_kind_t kind;
+    string_view_t value;
+    token_loc_t location;
+} token_t;
+
+void
+lexer_init(lexer_t *lexer, string_view_t source);
+
+void
+lexer_next_token(lexer_t *lexer, token_t *token);
+
+char *
+token_kind_to_cstr(token_kind_t kind);
+
+#endif /* LEXER_H */
-- 
2.43.2