* [PATCH olang v1] lexer: add lexer cursor abstraction
@ 2024-10-04 17:51 Johnny Richard
2024-10-04 15:52 ` [olang/patches/.build.yml] build success builds.sr.ht
2024-10-04 16:05 ` [PATCH olang v1] lexer: add lexer cursor abstraction Carlos Maniero
0 siblings, 2 replies; 3+ messages in thread
From: Johnny Richard @ 2024-10-04 17:51 UTC (permalink / raw)
To: ~johnnyrichard/olang-devel; +Cc: Johnny Richard
In order to simplify navigation and lexer state management, we use a
common structure shared between tokens and the lexer.
Signed-off-by: Johnny Richard <johnny@johnnyrichard.com>
---
src/lexer.c | 96 ++++++++++++++++++++++++----------------------------
src/lexer.h | 18 +++++-----
src/main.c | 4 +--
src/parser.c | 6 ++--
4 files changed, 58 insertions(+), 66 deletions(-)
diff --git a/src/lexer.c b/src/lexer.c
index 6fe0151..8de40a0 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -26,9 +26,9 @@ lexer_init(lexer_t *lexer, string_view_t source)
{
assert(lexer);
lexer->source = source;
- lexer->offset = 0;
- lexer->row = 0;
- lexer->bol = 0;
+ lexer->cur.offset = 0;
+ lexer->cur.row = 0;
+ lexer->cur.bol = 0;
}
static char
@@ -50,7 +50,7 @@ static void
lexer_init_char_value_token(lexer_t *lexer, token_t *token, token_kind_t kind);
static void
-lexer_init_str_value_token(lexer_t *lexer, token_t *token, token_kind_t kind, size_t start_offset);
+lexer_init_str_value_token(lexer_t *lexer, token_t *token, token_kind_t kind, lexer_cursor_t cur);
static void
lexer_init_eof_token(lexer_t *lexer, token_t *token);
@@ -84,120 +84,121 @@ lexer_next_token(lexer_t *lexer, token_t *token)
}
if (isalpha(current_char)) {
- size_t start_offset = lexer->offset;
+ lexer_cursor_t start_cur = lexer->cur;
while (isalnum(current_char) && lexer_is_not_eof(lexer)) {
lexer_skip_char(lexer);
current_char = lexer_current_char(lexer);
}
- string_view_t text = { .chars = lexer->source.chars + start_offset, .size = lexer->offset - start_offset };
+ string_view_t text = { .chars = lexer->source.chars + start_cur.offset,
+ .size = lexer->cur.offset - start_cur.offset };
- lexer_init_str_value_token(lexer, token, lexer_str_to_token_kind(text), start_offset);
+ lexer_init_str_value_token(lexer, token, lexer_str_to_token_kind(text), start_cur);
return;
}
if (isdigit(current_char)) {
- size_t start_offset = lexer->offset;
+ lexer_cursor_t start_cur = lexer->cur;
while (isdigit(current_char) && lexer_is_not_eof(lexer)) {
lexer_skip_char(lexer);
current_char = lexer_current_char(lexer);
}
- lexer_init_str_value_token(lexer, token, TOKEN_NUMBER, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_NUMBER, start_cur);
return;
}
switch (current_char) {
case '=': {
- size_t start_offset = lexer->offset;
+ lexer_cursor_t start_cur = lexer->cur;
lexer_skip_char(lexer);
if (lexer_current_char(lexer) == '=') {
lexer_skip_char(lexer);
- lexer_init_str_value_token(lexer, token, TOKEN_CMP_EQ, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_CMP_EQ, start_cur);
return;
}
- lexer_init_str_value_token(lexer, token, TOKEN_EQ, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_EQ, start_cur);
return;
}
case '!': {
- size_t start_offset = lexer->offset;
+ lexer_cursor_t start_cur = lexer->cur;
lexer_skip_char(lexer);
if (lexer_current_char(lexer) == '=') {
lexer_skip_char(lexer);
- lexer_init_str_value_token(lexer, token, TOKEN_CMP_NEQ, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_CMP_NEQ, start_cur);
return;
}
- lexer_init_str_value_token(lexer, token, TOKEN_BANG, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_BANG, start_cur);
return;
}
case '&': {
- size_t start_offset = lexer->offset;
+ lexer_cursor_t start_cur = lexer->cur;
lexer_skip_char(lexer);
if (lexer_current_char(lexer) == '&') {
lexer_skip_char(lexer);
- lexer_init_str_value_token(lexer, token, TOKEN_LOGICAL_AND, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_LOGICAL_AND, start_cur);
return;
}
- lexer_init_str_value_token(lexer, token, TOKEN_AND, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_AND, start_cur);
return;
}
case '|': {
- size_t start_offset = lexer->offset;
+ lexer_cursor_t start_cur = lexer->cur;
lexer_skip_char(lexer);
if (lexer_current_char(lexer) == '|') {
lexer_skip_char(lexer);
- lexer_init_str_value_token(lexer, token, TOKEN_LOGICAL_OR, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_LOGICAL_OR, start_cur);
return;
}
- lexer_init_str_value_token(lexer, token, TOKEN_PIPE, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_PIPE, start_cur);
return;
}
case '<': {
- size_t start_offset = lexer->offset;
+ lexer_cursor_t start_cur = lexer->cur;
lexer_skip_char(lexer);
switch (lexer_current_char(lexer)) {
case '<': {
lexer_skip_char(lexer);
- lexer_init_str_value_token(lexer, token, TOKEN_BITWISE_LSHIFT, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_BITWISE_LSHIFT, start_cur);
return;
}
case '=': {
lexer_skip_char(lexer);
- lexer_init_str_value_token(lexer, token, TOKEN_CMP_LEQ, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_CMP_LEQ, start_cur);
return;
}
default: {
- lexer_init_str_value_token(lexer, token, TOKEN_LT, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_LT, start_cur);
return;
}
}
}
case '>': {
- size_t start_offset = lexer->offset;
+ lexer_cursor_t start_cur = lexer->cur;
lexer_skip_char(lexer);
switch (lexer_current_char(lexer)) {
case '>': {
lexer_skip_char(lexer);
- lexer_init_str_value_token(lexer, token, TOKEN_BITWISE_RSHIFT, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_BITWISE_RSHIFT, start_cur);
return;
}
case '=': {
lexer_skip_char(lexer);
- lexer_init_str_value_token(lexer, token, TOKEN_CMP_GEQ, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_CMP_GEQ, start_cur);
return;
}
default: {
- lexer_init_str_value_token(lexer, token, TOKEN_GT, start_offset);
+ lexer_init_str_value_token(lexer, token, TOKEN_GT, start_cur);
return;
}
}
@@ -358,25 +359,25 @@ token_kind_is_binary_op(token_kind_t kind)
static char
lexer_current_char(lexer_t *lexer)
{
- return lexer->source.chars[lexer->offset];
+ return lexer->source.chars[lexer->cur.offset];
}
static void
lexer_skip_char(lexer_t *lexer)
{
- assert(lexer->offset < lexer->source.size);
+ assert(lexer->cur.offset < lexer->source.size);
if (lexer_current_char(lexer) == '\n') {
- lexer->row++;
- lexer->bol = ++lexer->offset;
+ lexer->cur.row++;
+ lexer->cur.bol = ++lexer->cur.offset;
} else {
- lexer->offset++;
+ lexer->cur.offset++;
}
}
static bool
lexer_is_eof(lexer_t *lexer)
{
- return lexer->offset >= lexer->source.size;
+ return lexer->cur.offset >= lexer->source.size;
}
static bool
@@ -394,25 +395,22 @@ _isspace(char c)
static void
lexer_init_char_value_token(lexer_t *lexer, token_t *token, token_kind_t kind)
{
- string_view_t str = { .chars = lexer->source.chars + lexer->offset, .size = 1 };
- token_loc_t location = { .offset = lexer->offset, .row = lexer->row, .bol = lexer->bol };
- *token = (token_t){ .kind = kind, .value = str, .location = location };
+ string_view_t str = { .chars = lexer->source.chars + lexer->cur.offset, .size = 1 };
+ *token = (token_t){ .kind = kind, .value = str, .cur = lexer->cur };
}
static void
-lexer_init_str_value_token(lexer_t *lexer, token_t *token, token_kind_t kind, size_t start_offset)
+lexer_init_str_value_token(lexer_t *lexer, token_t *token, token_kind_t kind, lexer_cursor_t cur)
{
- string_view_t str = { .chars = lexer->source.chars + start_offset, .size = lexer->offset - start_offset };
- token_loc_t location = { .offset = start_offset, .row = lexer->row, .bol = lexer->bol };
- *token = (token_t){ .kind = kind, .value = str, .location = location };
+ string_view_t str = { .chars = lexer->source.chars + cur.offset, .size = lexer->cur.offset - cur.offset };
+ *token = (token_t){ .kind = kind, .value = str, .cur = cur };
}
static void
lexer_init_eof_token(lexer_t *lexer, token_t *token)
{
string_view_t str = { 0 };
- token_loc_t location = { .offset = lexer->offset, .row = lexer->row, .bol = lexer->bol };
- *token = (token_t){ .kind = TOKEN_EOF, .value = str, .location = location };
+ *token = (token_t){ .kind = TOKEN_EOF, .value = str, .cur = lexer->cur };
}
static token_kind_t
@@ -450,23 +448,19 @@ lexer_peek_next(lexer_t *lexer, token_t *token)
void
lexer_lookahead(lexer_t *lexer, token_t *token, size_t n)
{
- size_t previous_offset = lexer->offset;
- size_t previous_row = lexer->row;
- size_t previous_bol = lexer->bol;
+ lexer_cursor_t previous_cur = lexer->cur;
for (size_t i = 0; i < n; ++i) {
lexer_next_token(lexer, token);
}
- lexer->offset = previous_offset;
- lexer->row = previous_row;
- lexer->bol = previous_bol;
+ lexer->cur = previous_cur;
}
string_view_t
lexer_get_token_line(lexer_t *lexer, token_t *token)
{
- size_t offset = token->location.bol;
+ size_t offset = token->cur.bol;
string_view_t line = { .chars = lexer->source.chars + offset, .size = 0 };
while ((line.size + offset) < lexer->source.size && line.chars[line.size] != '\n' && line.chars[line.size] != 0) {
diff --git a/src/lexer.h b/src/lexer.h
index 2746e3e..1aecb11 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -21,12 +21,17 @@
#include <stdint.h>
#include <stdio.h>
-typedef struct lexer
+typedef struct lexer_cursor
{
- string_view_t source;
size_t offset;
size_t row;
size_t bol;
+} lexer_cursor_t;
+
+typedef struct lexer
+{
+ string_view_t source;
+ lexer_cursor_t cur;
} lexer_t;
typedef enum token_kind
@@ -79,18 +84,11 @@ typedef enum token_kind
TOKEN_EOF
} token_kind_t;
-typedef struct token_loc
-{
- size_t offset;
- size_t row;
- size_t bol;
-} token_loc_t;
-
typedef struct token
{
token_kind_t kind;
string_view_t value;
- token_loc_t location;
+ lexer_cursor_t cur;
} token_t;
void
diff --git a/src/main.c b/src/main.c
index 60b17bf..9d66455 100644
--- a/src/main.c
+++ b/src/main.c
@@ -246,7 +246,7 @@ print_token(char *file_path, token_t *token)
{
printf("%s:%lu:%lu: <%s>\n",
file_path,
- token->location.row + 1,
- (token->location.offset - token->location.bol) + 1,
+ token->cur.row + 1,
+ (token->cur.offset - token->cur.bol) + 1,
token_kind_to_cstr(token->kind));
}
diff --git a/src/parser.c b/src/parser.c
index a025ed4..26e5465 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -623,14 +623,14 @@ expected_token(parser_t *parser, token_t *token, token_kind_t expected_kind)
fprintf(stderr,
"%s:%lu:%lu: error: got '" SV_FMT "' token but expect <%s>\n",
parser->file_path,
- token->location.row + 1,
- (token->location.offset - token->location.bol) + 1,
+ token->cur.row + 1,
+ (token->cur.offset - token->cur.bol) + 1,
SV_ARG(token->value),
token_kind_to_cstr(expected_kind));
string_view_t line = lexer_get_token_line(parser->lexer, token);
fprintf(stderr, "" SV_FMT "\n", SV_ARG(line));
- fprintf(stderr, "%*s\n", (int)(token->location.offset - token->location.bol + 1), "^");
+ fprintf(stderr, "%*s\n", (int)(token->cur.offset - token->cur.bol + 1), "^");
exit(EXIT_FAILURE);
}
base-commit: 9a9b1e51387cc60eb2a388713431f659cf4703c9
--
2.46.0
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2024-10-04 16:06 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-10-04 17:51 [PATCH olang v1] lexer: add lexer cursor abstraction Johnny Richard
2024-10-04 15:52 ` [olang/patches/.build.yml] build success builds.sr.ht
2024-10-04 16:05 ` [PATCH olang v1] lexer: add lexer cursor abstraction Carlos Maniero
Code repositories for project(s) associated with this public inbox
https://git.johnnyrichard.com/olang.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox