From mboxrd@z Thu Jan 1 00:00:00 1970 Authentication-Results: mail-a.sr.ht; dkim=pass header.d=johnnyrichard.com header.i=@johnnyrichard.com Received: from out-170.mta1.migadu.com (out-170.mta1.migadu.com [IPv6:2001:41d0:203:375::aa]) by mail-a.sr.ht (Postfix) with ESMTPS id 8168520330 for <~johnnyrichard/olang-devel@lists.sr.ht>; Wed, 28 Feb 2024 18:10:22 +0000 (UTC) X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=johnnyrichard.com; s=key1; t=1709143822; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=CGaUnmHR0KrUHJic0EeT37RxGn5JJ/g6Wea+r+8fTX0=; b=HSZGTsdytJjJk8nlTR2XSl9YAdiP9z+066rS2NrbfWTdXjvHvetlyWMvQvIkPz6rcYYGu8 q7EBYSzPdXA+blXcMC6IWvRdJxp/+nGzH3aumoX72GYqtMtqqLY3sIqmn8zSmNX+C/cvH8 H4XLbGW5kkeKGeU6oKYRMbXQUg9lRQOeS7Pk5C4F/SZjeedQhem8vxf9QG4BYWjOEgViaV sCvmDnSgRUD/70fIJsbGqZue/bJJ94bsg7X+OqSHjgKMV6HTtTWc17/zMpypYYKQQ35ry0 xFr3fjDeTWpvE0ZXHjcv6JkeETWjEiyEGfE3383m15yxLGgF7AJWuAS4YhjmKQ== From: Johnny Richard To: ~johnnyrichard/olang-devel@lists.sr.ht Cc: Johnny Richard Subject: [PATCH olang v1 4/4] parser: create simplified parser for tiny AST Date: Wed, 28 Feb 2024 20:04:21 +0100 Message-ID: <20240228190956.78191-5-johnny@johnnyrichard.com> In-Reply-To: <20240228190956.78191-1-johnny@johnnyrichard.com> References: <20240228190956.78191-1-johnny@johnnyrichard.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Migadu-Flow: FLOW_OUT X-TUID: GtNrxH5LdKJX This commit introduces a simple and restricted parser designed to handle a small program structure. Its purpose is to lay the foundation for future optimizations. Error handling during syntax analysis is rudimentary. If an error occurs, it will be printed, and the program will abort without further parsing to detect additional syntax errors. Additionally, it's important to note that semantic analysis will be conducted at a later stage in the compiler pipeline. As only u32 type is currently implemented, a separate type checker will not be developed. Consequently, the AST generated during syntax analysis can be directly passed to the backend. Signed-off-by: Johnny Richard --- src/ast.c | 79 ++++++++++++++++ src/ast.h | 101 ++++++++++++++++++++ src/lexer.c | 16 ++++ src/lexer.h | 4 + src/parser.c | 193 +++++++++++++++++++++++++++++++++++++++ src/parser.h | 38 ++++++++ tests/unit/parser_test.c | 88 ++++++++++++++++++ 7 files changed, 519 insertions(+) create mode 100644 src/ast.c create mode 100644 src/ast.h create mode 100644 src/parser.c create mode 100644 src/parser.h create mode 100644 tests/unit/parser_test.c diff --git a/src/ast.c b/src/ast.c new file mode 100644 index 0000000..ad3124d --- /dev/null +++ b/src/ast.c @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2024 olang maintainers + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "arena.h" +#include "ast.h" +#include "string_view.h" + +ast_node_t * +ast_make_node_fn_def(arena_t *arena, string_view_t identifier, type_t return_type, ast_node_t *block) +{ + ast_node_t *node_fn_def = (ast_node_t *)arena_alloc(arena, sizeof(ast_node_t)); + assert(node_fn_def); + + node_fn_def->kind = AST_NODE_FN_DEF; + ast_fn_definition_t *fn_def = &node_fn_def->data.as_fn_def; + + fn_def->identifier = identifier; + fn_def->return_type = return_type; + fn_def->block = block; + + return node_fn_def; +} + +ast_node_t * +ast_make_node_literal_u32(arena_t *arena, uint32_t value) +{ + ast_node_t *node_literal = (ast_node_t *)arena_alloc(arena, sizeof(ast_node_t)); + assert(node_literal); + + node_literal->kind = AST_NODE_LITERAL; + node_literal->data.as_literal.kind = AST_LITERAL_U32; + node_literal->data.as_literal.value.as_u32 = value; + + return node_literal; +} + +ast_node_t * +ast_make_node_return_stmt(arena_t *arena) +{ + ast_node_t *node_return_stmt = (ast_node_t *)arena_alloc(arena, sizeof(ast_node_t)); + assert(node_return_stmt); + + node_return_stmt->kind = AST_NODE_RETURN_STMT; + + return node_return_stmt; +} + +ast_node_t * +ast_make_node_block(arena_t *arena) +{ + ast_node_t *node_block = (ast_node_t *)arena_alloc(arena, sizeof(ast_node_t)); + assert(node_block); + + node_block->kind = AST_NODE_BLOCK; + + node_block->data.as_block.nodes = (list_t *)arena_alloc(arena, sizeof(list_t)); + assert(node_block->data.as_block.nodes); + + list_init(node_block->data.as_block.nodes, arena); + + return node_block; +} diff --git a/src/ast.h b/src/ast.h new file mode 100644 index 0000000..b80c067 --- /dev/null +++ b/src/ast.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2024 olang maintainers + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#ifndef AST_H +#define AST_H + +#include + +#include "arena.h" +#include "list.h" +#include "string_view.h" + +typedef struct ast_node ast_node_t; + +typedef enum +{ + AST_NODE_BLOCK, + AST_NODE_FN_DEF, + AST_NODE_RETURN_STMT, + AST_NODE_LITERAL, + AST_NODE_UNKNOWN +} ast_node_kind_t; + +typedef enum +{ + TYPE_U32 +} type_t; + +typedef struct ast_block +{ + list_t *nodes; +} ast_block_t; + +typedef struct ast_fn_definition +{ + string_view_t identifier; + type_t return_type; + ast_node_t *block; +} ast_fn_definition_t; + +typedef enum +{ + AST_LITERAL_U32 +} ast_literal_kind_t; + +typedef union +{ + uint32_t as_u32; +} ast_literal_value_t; + +typedef struct ast_literal +{ + ast_literal_kind_t kind; + ast_literal_value_t value; +} ast_literal_t; + +typedef struct ast_return_stmt +{ + ast_node_t *data; +} ast_return_stmt_t; + +typedef union +{ + ast_fn_definition_t as_fn_def; + ast_literal_t as_literal; + ast_block_t as_block; + ast_return_stmt_t as_return_stmt; +} ast_node_data_t; + +typedef struct ast_node +{ + ast_node_kind_t kind; + ast_node_data_t data; +} ast_node_t; + +ast_node_t * +ast_make_node_fn_def(arena_t *arena, string_view_t identifier, type_t return_type, ast_node_t *block); + +ast_node_t * +ast_make_node_literal_u32(arena_t *arena, uint32_t value); + +ast_node_t * +ast_make_node_return_stmt(arena_t *arena); + +ast_node_t * +ast_make_node_block(arena_t *arena); + +#endif /* AST_H */ diff --git a/src/lexer.c b/src/lexer.c index c7756a6..e9e97d4 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -19,6 +19,7 @@ #include #include #include +#include void lexer_init(lexer_t *lexer, string_view_t source) @@ -255,3 +256,18 @@ lexer_lookahead(lexer_t *lexer, token_t *token, size_t n) lexer->row = previous_row; lexer->bol = previous_bol; } + +void +lexer_print_token_highlight(lexer_t *lexer, token_t *token, FILE *stream) +{ + size_t offset = token->location.bol; + char *str = lexer->source.chars + offset; + + size_t i = 0; + while ((i + offset) < lexer->source.size && str[i] != '\n' && str[i] != 0) { + ++i; + } + string_view_t line = { .chars = str, .size = i }; + fprintf(stream, "" SV_FMT "\n", SV_ARG(line)); + fprintf(stream, "%*s\n", (int)(token->location.offset - token->location.bol + 1), "^"); +} diff --git a/src/lexer.h b/src/lexer.h index 729c957..d836b91 100644 --- a/src/lexer.h +++ b/src/lexer.h @@ -19,6 +19,7 @@ #include "string_view.h" #include +#include typedef struct lexer { @@ -77,4 +78,7 @@ lexer_lookahead(lexer_t *lexer, token_t *token, size_t n); char * token_kind_to_cstr(token_kind_t kind); +void +lexer_print_token_highlight(lexer_t *lexer, token_t *token, FILE *stream); + #endif /* LEXER_H */ diff --git a/src/parser.c b/src/parser.c new file mode 100644 index 0000000..f50b61a --- /dev/null +++ b/src/parser.c @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2024 olang maintainers + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include "lexer.h" +#include "parser.h" + +static bool +skip_expected_token(parser_t *parser, token_kind_t expected_kind); + +static bool +expected_token(parser_t *parser, token_t *token, token_kind_t kind); + +static bool +parser_parse_type(parser_t *parser, type_t *type); + +static ast_node_t * +parser_parse_block(parser_t *parser); + +static void +skip_line_feeds(lexer_t *lexer); + +void +parser_init(parser_t *parser, lexer_t *lexer, arena_t *arena, char *file_path) +{ + assert(parser && "parser is required"); + assert(lexer && "lexer is required"); + assert(file_path && "file_path is required"); + parser->lexer = lexer; + parser->arena = arena; + parser->file_path = file_path; +} + +ast_node_t * +parser_parse_fn_definition(parser_t *parser) +{ + if (!skip_expected_token(parser, TOKEN_FN)) + return NULL; + + skip_line_feeds(parser->lexer); + + token_t fn_name_token; + + if (!expected_token(parser, &fn_name_token, TOKEN_IDENTIFIER)) + return NULL; + + skip_line_feeds(parser->lexer); + + if (!skip_expected_token(parser, TOKEN_OPAREN)) + return NULL; + + skip_line_feeds(parser->lexer); + + if (!skip_expected_token(parser, TOKEN_CPAREN)) + return NULL; + + skip_line_feeds(parser->lexer); + + if (!skip_expected_token(parser, TOKEN_COLON)) + return NULL; + + skip_line_feeds(parser->lexer); + + type_t fn_return_type; + if (!parser_parse_type(parser, &fn_return_type)) { + return NULL; + } + + skip_line_feeds(parser->lexer); + + ast_node_t *block = parser_parse_block(parser); + if (block == NULL) { + return NULL; + } + + return ast_make_node_fn_def(parser->arena, fn_name_token.value, fn_return_type, block); +} + +static bool +parser_parse_type(parser_t *parser, type_t *type) +{ + token_t token; + + if (!expected_token(parser, &token, TOKEN_IDENTIFIER)) { + return false; + } + + if (string_view_eq_to_cstr(token.value, "u32")) { + *type = TYPE_U32; + return true; + } + + return false; +} + +static ast_node_t * +parser_parse_block(parser_t *parser) +{ + token_t number_token; + if (!skip_expected_token(parser, TOKEN_OCURLY)) { + return false; + } + + skip_line_feeds(parser->lexer); + + ast_node_t *node_block = ast_make_node_block(parser->arena); + + if (!skip_expected_token(parser, TOKEN_RETURN)) { + return false; + } + + ast_node_t *node_return_stmt = ast_make_node_return_stmt(parser->arena); + assert(node_return_stmt); + + if (!expected_token(parser, &number_token, TOKEN_NUMBER)) { + return false; + } + + ast_node_t *literal_node = ast_make_node_literal_u32(parser->arena, string_view_to_u32(number_token.value)); + assert(literal_node); + + node_return_stmt->data.as_return_stmt.data = literal_node; + + list_append(node_block->data.as_block.nodes, node_return_stmt); + + if (!skip_expected_token(parser, TOKEN_LF)) { + return false; + } + + skip_line_feeds(parser->lexer); + + if (!skip_expected_token(parser, TOKEN_CCURLY)) { + return false; + } + + return node_block; +} + +static bool +skip_expected_token(parser_t *parser, token_kind_t expected_kind) +{ + token_t token; + return expected_token(parser, &token, expected_kind); +} + +static bool +expected_token(parser_t *parser, token_t *token, token_kind_t expected_kind) +{ + lexer_next_token(parser->lexer, token); + + if (token->kind != expected_kind) { + fprintf(stderr, + "%s:%lu:%lu: error: got <%s> token but expect <%s>\n", + parser->file_path, + token->location.row + 1, + (token->location.offset - token->location.bol) + 1, + token_kind_to_cstr(token->kind), + token_kind_to_cstr(expected_kind)); + lexer_print_token_highlight(parser->lexer, token, stderr); + return false; + } + return true; +} + +static void +skip_line_feeds(lexer_t *lexer) +{ + token_t token; + lexer_peek_next(lexer, &token); + + while (token.kind == TOKEN_LF) { + lexer_next_token(lexer, &token); + lexer_peek_next(lexer, &token); + } +} diff --git a/src/parser.h b/src/parser.h new file mode 100644 index 0000000..3f1a00b --- /dev/null +++ b/src/parser.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2024 olang maintainers + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#ifndef PARSER_H +#define PARSER_H + +#include "arena.h" +#include "ast.h" +#include "lexer.h" + +typedef struct parser +{ + lexer_t *lexer; + arena_t *arena; + // TODO: we should define a better place to file_path string + char *file_path; +} parser_t; + +void +parser_init(parser_t *parser, lexer_t *lexer, arena_t *arena, char *file_path); + +ast_node_t * +parser_parse_fn_definition(parser_t *parser); + +#endif /* PARSER_H */ diff --git a/tests/unit/parser_test.c b/tests/unit/parser_test.c new file mode 100644 index 0000000..32ebc8e --- /dev/null +++ b/tests/unit/parser_test.c @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2024 olang maintainers + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#define MUNIT_ENABLE_ASSERT_ALIASES + +#include "arena.h" +#include "ast.h" +#include "lexer.h" +#include "list.h" +#include "munit.h" +#include "parser.h" +#include "string_view.h" + +#define ARENA_CAPACITY (1024 * 1024) + +static MunitResult +parse_fn_definition_test(const MunitParameter params[], void *user_data_or_fixture) +{ + arena_t arena = arena_new(ARENA_CAPACITY); + + char *file_path = "main.0"; + char *source_value = "fn main(): u32 {\n\treturn 69\n}"; + + lexer_t lexer; + string_view_t source = { .chars = source_value, .size = strlen(source_value) }; + lexer_init(&lexer, source); + + parser_t parser; + parser_init(&parser, &lexer, &arena, file_path); + + ast_node_t *node_fn_def = parser_parse_fn_definition(&parser); + assert_not_null(node_fn_def); + assert_uint(node_fn_def->kind, ==, AST_NODE_FN_DEF); + + ast_fn_definition_t *fn = &node_fn_def->data.as_fn_def; + assert_memory_equal(fn->identifier.size, fn->identifier.chars, "main"); + assert_uint(fn->return_type, ==, TYPE_U32); + + ast_node_t *block = fn->block; + assert_not_null(block); + + assert_uint(block->kind, ==, AST_NODE_BLOCK); + assert_uint(list_size(block->data.as_block.nodes), ==, 1); + list_item_t *block_item = list_get(block->data.as_block.nodes, 0); + assert_not_null(block_item); + assert_not_null(block_item->value); + + ast_node_t *node = (ast_node_t *)block_item->value; + assert_not_null(node); + assert_uint(node->kind, ==, AST_NODE_RETURN_STMT); + + ast_node_t *number_node = node->data.as_return_stmt.data; + assert_not_null(number_node); + assert_uint(number_node->kind, ==, AST_NODE_LITERAL); + assert_uint(number_node->data.as_literal.kind, ==, AST_LITERAL_U32); + assert_uint(number_node->data.as_literal.value.as_u32, ==, 69); + + arena_free(&arena); + + return MUNIT_OK; +} + +static MunitTest tests[] = { + { "/parse_fn_definition", parse_fn_definition_test, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL }, + { NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL } +}; + +static const MunitSuite suite = { "/parser", tests, NULL, 1, MUNIT_SUITE_OPTION_NONE }; + +int +main(int argc, char *argv[]) +{ + return munit_suite_main(&suite, NULL, argc, argv); + return EXIT_SUCCESS; +} -- 2.43.2