public inbox for ~johnnyrichard/olang-devel@lists.sr.ht
* [PATCH olang v1] lexer: add lexer cursor abstraction
@ 2024-10-04 17:51 Johnny Richard
  2024-10-04 15:52 ` [olang/patches/.build.yml] build success builds.sr.ht
  2024-10-04 16:05 ` [PATCH olang v1] lexer: add lexer cursor abstraction Carlos Maniero
  0 siblings, 2 replies; 3+ messages in thread
From: Johnny Richard @ 2024-10-04 17:51 UTC (permalink / raw)
  To: ~johnnyrichard/olang-devel; +Cc: Johnny Richard

To simplify navigation and the bookkeeping of lexer state, the lexer and
tokens now share a common cursor structure.
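
For illustration, here is a minimal sketch of the shape this patch
introduces. string_view_t is simplified below and the helper function is
hypothetical; the real definitions live in the sources touched by the diff.

    #include <stddef.h>

    /* Simplified stand-in for the project's string view type. */
    typedef struct string_view
    {
        char *chars;
        size_t size;
    } string_view_t;

    /* One cursor type shared by the lexer and by tokens. */
    typedef struct lexer_cursor
    {
        size_t offset; /* absolute position in the source buffer */
        size_t row;    /* zero-based line number */
        size_t bol;    /* offset of the first character of the current line */
    } lexer_cursor_t;

    typedef struct lexer
    {
        string_view_t source;
        lexer_cursor_t cur; /* replaces the loose offset/row/bol fields */
    } lexer_t;

    /* Hypothetical helper: lookahead can now save and restore the whole
     * position with a single struct assignment. */
    static void
    lexer_save_restore_sketch(lexer_t *lexer)
    {
        lexer_cursor_t saved = lexer->cur;
        /* ... scan ahead, mutating lexer->cur ... */
        lexer->cur = saved;
    }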

Signed-off-by: Johnny Richard <johnny@johnnyrichard.com>
---
 src/lexer.c  | 96 ++++++++++++++++++++++++----------------------------
 src/lexer.h  | 18 +++++-----
 src/main.c   |  4 +--
 src/parser.c |  6 ++--
 4 files changed, 58 insertions(+), 66 deletions(-)

diff --git a/src/lexer.c b/src/lexer.c
index 6fe0151..8de40a0 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -26,9 +26,9 @@ lexer_init(lexer_t *lexer, string_view_t source)
 {
     assert(lexer);
     lexer->source = source;
-    lexer->offset = 0;
-    lexer->row = 0;
-    lexer->bol = 0;
+    lexer->cur.offset = 0;
+    lexer->cur.row = 0;
+    lexer->cur.bol = 0;
 }
 
 static char
@@ -50,7 +50,7 @@ static void
 lexer_init_char_value_token(lexer_t *lexer, token_t *token, token_kind_t kind);
 
 static void
-lexer_init_str_value_token(lexer_t *lexer, token_t *token, token_kind_t kind, size_t start_offset);
+lexer_init_str_value_token(lexer_t *lexer, token_t *token, token_kind_t kind, lexer_cursor_t cur);
 
 static void
 lexer_init_eof_token(lexer_t *lexer, token_t *token);
@@ -84,120 +84,121 @@ lexer_next_token(lexer_t *lexer, token_t *token)
         }
 
         if (isalpha(current_char)) {
-            size_t start_offset = lexer->offset;
+            lexer_cursor_t start_cur = lexer->cur;
             while (isalnum(current_char) && lexer_is_not_eof(lexer)) {
                 lexer_skip_char(lexer);
                 current_char = lexer_current_char(lexer);
             }
 
-            string_view_t text = { .chars = lexer->source.chars + start_offset, .size = lexer->offset - start_offset };
+            string_view_t text = { .chars = lexer->source.chars + start_cur.offset,
+                                   .size = lexer->cur.offset - start_cur.offset };
 
-            lexer_init_str_value_token(lexer, token, lexer_str_to_token_kind(text), start_offset);
+            lexer_init_str_value_token(lexer, token, lexer_str_to_token_kind(text), start_cur);
             return;
         }
 
         if (isdigit(current_char)) {
-            size_t start_offset = lexer->offset;
+            lexer_cursor_t start_cur = lexer->cur;
             while (isdigit(current_char) && lexer_is_not_eof(lexer)) {
                 lexer_skip_char(lexer);
                 current_char = lexer_current_char(lexer);
             }
 
-            lexer_init_str_value_token(lexer, token, TOKEN_NUMBER, start_offset);
+            lexer_init_str_value_token(lexer, token, TOKEN_NUMBER, start_cur);
             return;
         }
 
         switch (current_char) {
             case '=': {
-                size_t start_offset = lexer->offset;
+                lexer_cursor_t start_cur = lexer->cur;
                 lexer_skip_char(lexer);
 
                 if (lexer_current_char(lexer) == '=') {
                     lexer_skip_char(lexer);
-                    lexer_init_str_value_token(lexer, token, TOKEN_CMP_EQ, start_offset);
+                    lexer_init_str_value_token(lexer, token, TOKEN_CMP_EQ, start_cur);
                     return;
                 }
 
-                lexer_init_str_value_token(lexer, token, TOKEN_EQ, start_offset);
+                lexer_init_str_value_token(lexer, token, TOKEN_EQ, start_cur);
                 return;
             }
             case '!': {
-                size_t start_offset = lexer->offset;
+                lexer_cursor_t start_cur = lexer->cur;
                 lexer_skip_char(lexer);
 
                 if (lexer_current_char(lexer) == '=') {
                     lexer_skip_char(lexer);
-                    lexer_init_str_value_token(lexer, token, TOKEN_CMP_NEQ, start_offset);
+                    lexer_init_str_value_token(lexer, token, TOKEN_CMP_NEQ, start_cur);
                     return;
                 }
 
-                lexer_init_str_value_token(lexer, token, TOKEN_BANG, start_offset);
+                lexer_init_str_value_token(lexer, token, TOKEN_BANG, start_cur);
                 return;
             }
             case '&': {
-                size_t start_offset = lexer->offset;
+                lexer_cursor_t start_cur = lexer->cur;
                 lexer_skip_char(lexer);
 
                 if (lexer_current_char(lexer) == '&') {
                     lexer_skip_char(lexer);
-                    lexer_init_str_value_token(lexer, token, TOKEN_LOGICAL_AND, start_offset);
+                    lexer_init_str_value_token(lexer, token, TOKEN_LOGICAL_AND, start_cur);
                     return;
                 }
 
-                lexer_init_str_value_token(lexer, token, TOKEN_AND, start_offset);
+                lexer_init_str_value_token(lexer, token, TOKEN_AND, start_cur);
                 return;
             }
             case '|': {
-                size_t start_offset = lexer->offset;
+                lexer_cursor_t start_cur = lexer->cur;
                 lexer_skip_char(lexer);
 
                 if (lexer_current_char(lexer) == '|') {
                     lexer_skip_char(lexer);
-                    lexer_init_str_value_token(lexer, token, TOKEN_LOGICAL_OR, start_offset);
+                    lexer_init_str_value_token(lexer, token, TOKEN_LOGICAL_OR, start_cur);
                     return;
                 }
 
-                lexer_init_str_value_token(lexer, token, TOKEN_PIPE, start_offset);
+                lexer_init_str_value_token(lexer, token, TOKEN_PIPE, start_cur);
                 return;
             }
             case '<': {
-                size_t start_offset = lexer->offset;
+                lexer_cursor_t start_cur = lexer->cur;
                 lexer_skip_char(lexer);
 
                 switch (lexer_current_char(lexer)) {
                     case '<': {
                         lexer_skip_char(lexer);
-                        lexer_init_str_value_token(lexer, token, TOKEN_BITWISE_LSHIFT, start_offset);
+                        lexer_init_str_value_token(lexer, token, TOKEN_BITWISE_LSHIFT, start_cur);
                         return;
                     }
                     case '=': {
                         lexer_skip_char(lexer);
-                        lexer_init_str_value_token(lexer, token, TOKEN_CMP_LEQ, start_offset);
+                        lexer_init_str_value_token(lexer, token, TOKEN_CMP_LEQ, start_cur);
                         return;
                     }
                     default: {
-                        lexer_init_str_value_token(lexer, token, TOKEN_LT, start_offset);
+                        lexer_init_str_value_token(lexer, token, TOKEN_LT, start_cur);
                         return;
                     }
                 }
             }
             case '>': {
-                size_t start_offset = lexer->offset;
+                lexer_cursor_t start_cur = lexer->cur;
                 lexer_skip_char(lexer);
 
                 switch (lexer_current_char(lexer)) {
                     case '>': {
                         lexer_skip_char(lexer);
-                        lexer_init_str_value_token(lexer, token, TOKEN_BITWISE_RSHIFT, start_offset);
+                        lexer_init_str_value_token(lexer, token, TOKEN_BITWISE_RSHIFT, start_cur);
                         return;
                     }
                     case '=': {
                         lexer_skip_char(lexer);
-                        lexer_init_str_value_token(lexer, token, TOKEN_CMP_GEQ, start_offset);
+                        lexer_init_str_value_token(lexer, token, TOKEN_CMP_GEQ, start_cur);
                         return;
                     }
                     default: {
-                        lexer_init_str_value_token(lexer, token, TOKEN_GT, start_offset);
+                        lexer_init_str_value_token(lexer, token, TOKEN_GT, start_cur);
                         return;
                     }
                 }
@@ -358,25 +359,25 @@ token_kind_is_binary_op(token_kind_t kind)
 static char
 lexer_current_char(lexer_t *lexer)
 {
-    return lexer->source.chars[lexer->offset];
+    return lexer->source.chars[lexer->cur.offset];
 }
 
 static void
 lexer_skip_char(lexer_t *lexer)
 {
-    assert(lexer->offset < lexer->source.size);
+    assert(lexer->cur.offset < lexer->source.size);
     if (lexer_current_char(lexer) == '\n') {
-        lexer->row++;
-        lexer->bol = ++lexer->offset;
+        lexer->cur.row++;
+        lexer->cur.bol = ++lexer->cur.offset;
     } else {
-        lexer->offset++;
+        lexer->cur.offset++;
     }
 }
 
 static bool
 lexer_is_eof(lexer_t *lexer)
 {
-    return lexer->offset >= lexer->source.size;
+    return lexer->cur.offset >= lexer->source.size;
 }
 
 static bool
@@ -394,25 +395,22 @@ _isspace(char c)
 static void
 lexer_init_char_value_token(lexer_t *lexer, token_t *token, token_kind_t kind)
 {
-    string_view_t str = { .chars = lexer->source.chars + lexer->offset, .size = 1 };
-    token_loc_t location = { .offset = lexer->offset, .row = lexer->row, .bol = lexer->bol };
-    *token = (token_t){ .kind = kind, .value = str, .location = location };
+    string_view_t str = { .chars = lexer->source.chars + lexer->cur.offset, .size = 1 };
+    *token = (token_t){ .kind = kind, .value = str, .cur = lexer->cur };
 }
 
 static void
-lexer_init_str_value_token(lexer_t *lexer, token_t *token, token_kind_t kind, size_t start_offset)
+lexer_init_str_value_token(lexer_t *lexer, token_t *token, token_kind_t kind, lexer_cursor_t cur)
 {
-    string_view_t str = { .chars = lexer->source.chars + start_offset, .size = lexer->offset - start_offset };
-    token_loc_t location = { .offset = start_offset, .row = lexer->row, .bol = lexer->bol };
-    *token = (token_t){ .kind = kind, .value = str, .location = location };
+    string_view_t str = { .chars = lexer->source.chars + cur.offset, .size = lexer->cur.offset - cur.offset };
+    *token = (token_t){ .kind = kind, .value = str, .cur = cur };
 }
 
 static void
 lexer_init_eof_token(lexer_t *lexer, token_t *token)
 {
     string_view_t str = { 0 };
-    token_loc_t location = { .offset = lexer->offset, .row = lexer->row, .bol = lexer->bol };
-    *token = (token_t){ .kind = TOKEN_EOF, .value = str, .location = location };
+    *token = (token_t){ .kind = TOKEN_EOF, .value = str, .cur = lexer->cur };
 }
 
 static token_kind_t
@@ -450,23 +448,19 @@ lexer_peek_next(lexer_t *lexer, token_t *token)
 void
 lexer_lookahead(lexer_t *lexer, token_t *token, size_t n)
 {
-    size_t previous_offset = lexer->offset;
-    size_t previous_row = lexer->row;
-    size_t previous_bol = lexer->bol;
+    lexer_cursor_t previous_cur = lexer->cur;
 
     for (size_t i = 0; i < n; ++i) {
         lexer_next_token(lexer, token);
     }
 
-    lexer->offset = previous_offset;
-    lexer->row = previous_row;
-    lexer->bol = previous_bol;
+    lexer->cur = previous_cur;
 }
 
 string_view_t
 lexer_get_token_line(lexer_t *lexer, token_t *token)
 {
-    size_t offset = token->location.bol;
+    size_t offset = token->cur.bol;
     string_view_t line = { .chars = lexer->source.chars + offset, .size = 0 };
 
     while ((line.size + offset) < lexer->source.size && line.chars[line.size] != '\n' && line.chars[line.size] != 0) {
diff --git a/src/lexer.h b/src/lexer.h
index 2746e3e..1aecb11 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -21,12 +21,17 @@
 #include <stdint.h>
 #include <stdio.h>
 
-typedef struct lexer
+typedef struct lexer_cursor
 {
-    string_view_t source;
     size_t offset;
     size_t row;
     size_t bol;
+} lexer_cursor_t;
+
+typedef struct lexer
+{
+    string_view_t source;
+    lexer_cursor_t cur;
 } lexer_t;
 
 typedef enum token_kind
@@ -79,18 +84,11 @@ typedef enum token_kind
     TOKEN_EOF
 } token_kind_t;
 
-typedef struct token_loc
-{
-    size_t offset;
-    size_t row;
-    size_t bol;
-} token_loc_t;
-
 typedef struct token
 {
     token_kind_t kind;
     string_view_t value;
-    token_loc_t location;
+    lexer_cursor_t cur;
 } token_t;
 
 void
diff --git a/src/main.c b/src/main.c
index 60b17bf..9d66455 100644
--- a/src/main.c
+++ b/src/main.c
@@ -246,7 +246,7 @@ print_token(char *file_path, token_t *token)
 {
     printf("%s:%lu:%lu: <%s>\n",
            file_path,
-           token->location.row + 1,
-           (token->location.offset - token->location.bol) + 1,
+           token->cur.row + 1,
+           (token->cur.offset - token->cur.bol) + 1,
            token_kind_to_cstr(token->kind));
 }
diff --git a/src/parser.c b/src/parser.c
index a025ed4..26e5465 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -623,14 +623,14 @@ expected_token(parser_t *parser, token_t *token, token_kind_t expected_kind)
         fprintf(stderr,
                 "%s:%lu:%lu: error: got '" SV_FMT "' token but expect <%s>\n",
                 parser->file_path,
-                token->location.row + 1,
-                (token->location.offset - token->location.bol) + 1,
+                token->cur.row + 1,
+                (token->cur.offset - token->cur.bol) + 1,
                 SV_ARG(token->value),
                 token_kind_to_cstr(expected_kind));
 
         string_view_t line = lexer_get_token_line(parser->lexer, token);
         fprintf(stderr, "" SV_FMT "\n", SV_ARG(line));
-        fprintf(stderr, "%*s\n", (int)(token->location.offset - token->location.bol + 1), "^");
+        fprintf(stderr, "%*s\n", (int)(token->cur.offset - token->cur.bol + 1), "^");
 
         exit(EXIT_FAILURE);
     }

base-commit: 9a9b1e51387cc60eb2a388713431f659cf4703c9
-- 
2.46.0



Code repositories for project(s) associated with this public inbox

	https://git.johnnyrichard.com/olang.git
