/*
* Copyright (C) 2024 olang maintainers
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include "lexer.h"

#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>
/*
 * Prepare a lexer to scan src from the very beginning: cursor at offset 0,
 * on row 0, with the beginning-of-line marker at 0.
 */
void
lexer_init(lexer_t *lexer, source_code_t src)
{
    assert(lexer);
    lexer->src = src;
    lexer->cur.bol = 0;
    lexer->cur.row = 0;
    lexer->cur.offset = 0;
}
/* Internal helpers; definitions follow below. */
static char
lexer_current_char(lexer_t *lexer);
static void
lexer_skip_char(lexer_t *lexer);
static bool
lexer_is_eof(lexer_t *lexer);
static bool
lexer_is_not_eof(lexer_t *lexer);
/* Like isspace(), but excludes '\n', which the lexer emits as TOKEN_LF. */
static bool
_isspace(char c);
/* Token constructors: single-char token at the cursor, multi-char token from
 * a saved start cursor, and the end-of-file sentinel token. */
static void
lexer_init_char_value_token(lexer_t *lexer, token_t *token, token_kind_t kind);
static void
lexer_init_str_value_token(lexer_t *lexer, token_t *token, token_kind_t kind, lexer_cursor_t cur);
static void
lexer_init_eof_token(lexer_t *lexer, token_t *token);
/* Maps keyword spellings ("if", "fn", ...) to keyword kinds, else TOKEN_ID. */
static token_kind_t
lexer_str_to_token_kind(string_view_t text);
/*
 * Scan the token starting at the lexer's cursor and write it into *token,
 * advancing the cursor past the consumed characters.
 *
 * Dispatch order: EOF, inline whitespace (skipped, never emitted), '#' line
 * comments (consumed up to but not including the '\n'), identifiers/keywords,
 * integer literals, then operators and punctuation. '\n' itself is a token
 * (TOKEN_LF); anything unrecognized becomes TOKEN_UNKNOWN so the caller can
 * report it rather than the lexer aborting.
 */
void
lexer_next_token(lexer_t *lexer, token_t *token)
{
    if (lexer_is_eof(lexer)) {
        lexer_init_eof_token(lexer, token);
        return;
    }
    char current_char = lexer_current_char(lexer);
    /* Skip horizontal whitespace; _isspace excludes '\n' so line feeds still
     * surface as TOKEN_LF in the switch below. */
    if (_isspace(current_char)) {
        while (_isspace(current_char) && lexer_is_not_eof(lexer)) {
            lexer_skip_char(lexer);
            current_char = lexer_current_char(lexer);
            /* NOTE(review): current_char is re-read even when the skip just
             * reached EOF — presumably the source buffer is NUL-terminated or
             * sized one past the last byte; confirm against source_code_t. */
        }
    }
    while (lexer_is_not_eof(lexer)) {
        /* Line comment: consume to end of line; the '\n' itself is left for
         * the switch below to emit as TOKEN_LF. */
        if (current_char == '#') {
            while (current_char != '\n' && lexer_is_not_eof(lexer)) {
                lexer_skip_char(lexer);
                current_char = lexer_current_char(lexer);
            }
        }
        /* Identifier or keyword: [A-Za-z][A-Za-z0-9]*.
         * NOTE(review): isalpha/isalnum receive a raw char — undefined
         * behavior for negative (non-ASCII) bytes on signed-char platforms;
         * consider casting to unsigned char. */
        if (isalpha(current_char)) {
            lexer_cursor_t start_cur = lexer->cur;
            while (isalnum(current_char) && lexer_is_not_eof(lexer)) {
                lexer_skip_char(lexer);
                current_char = lexer_current_char(lexer);
            }
            string_view_t text = {
                .chars = lexer->src.code.chars + start_cur.offset,
                .size = lexer->cur.offset - start_cur.offset,
            };
            lexer_init_str_value_token(lexer, token, lexer_str_to_token_kind(text), start_cur);
            return;
        }
        /* Integer literal: a run of decimal digits. */
        if (isdigit(current_char)) {
            lexer_cursor_t start_cur = lexer->cur;
            while (isdigit(current_char) && lexer_is_not_eof(lexer)) {
                lexer_skip_char(lexer);
                current_char = lexer_current_char(lexer);
            }
            lexer_init_str_value_token(lexer, token, TOKEN_NUMBER, start_cur);
            return;
        }
        /* Operators and punctuation. Two-character operators (==, !=, &&,
         * ||, <<, <=, >>, >=) use one character of lookahead; everything
         * else is a single-character token. */
        switch (current_char) {
            case '=': {
                lexer_cursor_t start_cur = lexer->cur;
                lexer_skip_char(lexer);
                if (lexer_current_char(lexer) == '=') {
                    lexer_skip_char(lexer);
                    lexer_init_str_value_token(lexer, token, TOKEN_CMP_EQ, start_cur);
                    return;
                }
                lexer_init_str_value_token(lexer, token, TOKEN_EQ, start_cur);
                return;
            }
            case '!': {
                lexer_cursor_t start_cur = lexer->cur;
                lexer_skip_char(lexer);
                if (lexer_current_char(lexer) == '=') {
                    lexer_skip_char(lexer);
                    lexer_init_str_value_token(lexer, token, TOKEN_CMP_NEQ, start_cur);
                    return;
                }
                lexer_init_str_value_token(lexer, token, TOKEN_BANG, start_cur);
                return;
            }
            case '&': {
                lexer_cursor_t start_cur = lexer->cur;
                lexer_skip_char(lexer);
                if (lexer_current_char(lexer) == '&') {
                    lexer_skip_char(lexer);
                    lexer_init_str_value_token(lexer, token, TOKEN_LOGICAL_AND, start_cur);
                    return;
                }
                lexer_init_str_value_token(lexer, token, TOKEN_AND, start_cur);
                return;
            }
            case '|': {
                lexer_cursor_t start_cur = lexer->cur;
                lexer_skip_char(lexer);
                if (lexer_current_char(lexer) == '|') {
                    lexer_skip_char(lexer);
                    lexer_init_str_value_token(lexer, token, TOKEN_LOGICAL_OR, start_cur);
                    return;
                }
                lexer_init_str_value_token(lexer, token, TOKEN_PIPE, start_cur);
                return;
            }
            case '<': {
                lexer_cursor_t start_cur = lexer->cur;
                lexer_skip_char(lexer);
                switch (lexer_current_char(lexer)) {
                    case '<': {
                        lexer_skip_char(lexer);
                        lexer_init_str_value_token(lexer, token, TOKEN_BITWISE_LSHIFT, start_cur);
                        return;
                    }
                    case '=': {
                        lexer_skip_char(lexer);
                        lexer_init_str_value_token(lexer, token, TOKEN_CMP_LEQ, start_cur);
                        return;
                    }
                    default: {
                        lexer_init_str_value_token(lexer, token, TOKEN_LT, start_cur);
                        return;
                    }
                }
            }
            case '>': {
                lexer_cursor_t start_cur = lexer->cur;
                lexer_skip_char(lexer);
                switch (lexer_current_char(lexer)) {
                    case '>': {
                        lexer_skip_char(lexer);
                        lexer_init_str_value_token(lexer, token, TOKEN_BITWISE_RSHIFT, start_cur);
                        return;
                    }
                    case '=': {
                        lexer_skip_char(lexer);
                        lexer_init_str_value_token(lexer, token, TOKEN_CMP_GEQ, start_cur);
                        return;
                    }
                    default: {
                        lexer_init_str_value_token(lexer, token, TOKEN_GT, start_cur);
                        return;
                    }
                }
            }
            case '^': {
                lexer_init_char_value_token(lexer, token, TOKEN_CIRCUMFLEX);
                lexer_skip_char(lexer);
                return;
            }
            case '%': {
                lexer_init_char_value_token(lexer, token, TOKEN_PERCENT);
                lexer_skip_char(lexer);
                return;
            }
            case '(': {
                lexer_init_char_value_token(lexer, token, TOKEN_OPAREN);
                lexer_skip_char(lexer);
                return;
            }
            case ')': {
                lexer_init_char_value_token(lexer, token, TOKEN_CPAREN);
                lexer_skip_char(lexer);
                return;
            }
            case ':': {
                lexer_init_char_value_token(lexer, token, TOKEN_COLON);
                lexer_skip_char(lexer);
                return;
            }
            case ',': {
                lexer_init_char_value_token(lexer, token, TOKEN_COMMA);
                lexer_skip_char(lexer);
                return;
            }
            case '{': {
                lexer_init_char_value_token(lexer, token, TOKEN_OCURLY);
                lexer_skip_char(lexer);
                return;
            }
            case '}': {
                lexer_init_char_value_token(lexer, token, TOKEN_CCURLY);
                lexer_skip_char(lexer);
                return;
            }
            case '+': {
                lexer_init_char_value_token(lexer, token, TOKEN_PLUS);
                lexer_skip_char(lexer);
                return;
            }
            case '-': {
                lexer_init_char_value_token(lexer, token, TOKEN_DASH);
                lexer_skip_char(lexer);
                return;
            }
            case '*': {
                lexer_init_char_value_token(lexer, token, TOKEN_STAR);
                lexer_skip_char(lexer);
                return;
            }
            case '~': {
                lexer_init_char_value_token(lexer, token, TOKEN_TILDE);
                lexer_skip_char(lexer);
                return;
            }
            case '/': {
                lexer_init_char_value_token(lexer, token, TOKEN_SLASH);
                lexer_skip_char(lexer);
                return;
            }
            case '\n': {
                lexer_init_char_value_token(lexer, token, TOKEN_LF);
                lexer_skip_char(lexer);
                return;
            }
            default: {
                /* Unrecognized byte: emit TOKEN_UNKNOWN and keep going. */
                lexer_init_char_value_token(lexer, token, TOKEN_UNKNOWN);
                lexer_skip_char(lexer);
                return;
            }
        }
    }
    /* Reached when the input ended while skipping whitespace or a trailing
     * comment: emit EOF so the caller always receives an initialized token. */
    if (lexer_is_eof(lexer)) {
        lexer_init_eof_token(lexer, token);
        return;
    }
}
/*
 * Human-readable name for each token kind, indexed by token_kind_t.
 * Keep entries in sync with the token_kind_t enum in lexer.h.
 */
static char *token_kind_str_table[] = {
    [TOKEN_UNKNOWN] = "unknown",
    [TOKEN_ID] = "identifier",
    [TOKEN_NUMBER] = "number",
    [TOKEN_FN] = "fn",
    [TOKEN_RETURN] = "return",
    [TOKEN_IF] = "if",
    [TOKEN_ELSE] = "else",
    [TOKEN_WHILE] = "while",
    [TOKEN_VAR] = "var",
    [TOKEN_LF] = "line_feed",
    [TOKEN_OPAREN] = "(",
    [TOKEN_CPAREN] = ")",
    [TOKEN_COLON] = ":",
    [TOKEN_COMMA] = ",",
    [TOKEN_OCURLY] = "{",
    [TOKEN_CCURLY] = "}",
    [TOKEN_PLUS] = "+",
    [TOKEN_DASH] = "-",
    [TOKEN_STAR] = "*",
    [TOKEN_SLASH] = "/",
    [TOKEN_TILDE] = "~",
    [TOKEN_EQ] = "=",
    [TOKEN_CMP_EQ] = "==",
    [TOKEN_BANG] = "!",
    [TOKEN_CMP_NEQ] = "!=",
    [TOKEN_LT] = "<",
    [TOKEN_GT] = ">",
    [TOKEN_CMP_LEQ] = "<=",
    [TOKEN_CMP_GEQ] = ">=",
    [TOKEN_PERCENT] = "%",
    [TOKEN_BITWISE_LSHIFT] = "<<",
    [TOKEN_BITWISE_RSHIFT] = ">>",
    [TOKEN_CIRCUMFLEX] = "^",
    [TOKEN_PIPE] = "|",
    [TOKEN_LOGICAL_OR] = "||",
    [TOKEN_AND] = "&",
    [TOKEN_LOGICAL_AND] = "&&",
    [TOKEN_EOF] = "EOF",
};
/*
 * Return the human-readable name of a token kind.
 *
 * Bug fix: the old assert compared against sizeof(token_kind_str_table),
 * which is the table's size in BYTES (entry count * sizeof(char *)), so any
 * kind up to several times past the end of the table slipped through the
 * assert and indexed out of bounds. The bound must be the element count.
 */
char *
token_kind_to_cstr(token_kind_t kind)
{
    assert(kind < sizeof(token_kind_str_table) / sizeof(token_kind_str_table[0]));
    return token_kind_str_table[kind];
}
/*
 * Report whether a token kind can appear as an infix binary operator
 * between two operands.
 */
bool
token_kind_is_binary_op(token_kind_t kind)
{
    switch (kind) {
        /* assignment */
        case TOKEN_EQ:
        /* arithmetic */
        case TOKEN_PLUS:
        case TOKEN_DASH:
        case TOKEN_STAR:
        case TOKEN_SLASH:
        case TOKEN_PERCENT:
        /* bitwise */
        case TOKEN_AND:
        case TOKEN_PIPE:
        case TOKEN_CIRCUMFLEX:
        case TOKEN_BITWISE_LSHIFT:
        case TOKEN_BITWISE_RSHIFT:
        /* comparison */
        case TOKEN_LT:
        case TOKEN_GT:
        case TOKEN_CMP_LEQ:
        case TOKEN_CMP_GEQ:
        case TOKEN_CMP_EQ:
        case TOKEN_CMP_NEQ:
        /* logical */
        case TOKEN_LOGICAL_AND:
        case TOKEN_LOGICAL_OR:
            return true;
        default:
            return false;
    }
}
/*
 * Byte under the cursor. No bounds check — callers are expected to guard
 * with lexer_is_eof before reading.
 */
static char
lexer_current_char(lexer_t *lexer)
{
    size_t at = lexer->cur.offset;
    return lexer->src.code.chars[at];
}
/*
 * Advance the cursor by one byte, updating the row counter and the
 * beginning-of-line marker when stepping over a newline.
 */
static void
lexer_skip_char(lexer_t *lexer)
{
    assert(lexer->cur.offset < lexer->src.code.size);
    bool was_newline = lexer_current_char(lexer) == '\n';
    lexer->cur.offset++;
    if (was_newline) {
        lexer->cur.row++;
        lexer->cur.bol = lexer->cur.offset;
    }
}
/* True once the cursor has consumed every byte of the source buffer. */
static bool
lexer_is_eof(lexer_t *lexer)
{
    return !(lexer->cur.offset < lexer->src.code.size);
}
/* Readability helper for loop conditions: the negation of lexer_is_eof. */
static bool
lexer_is_not_eof(lexer_t *lexer)
{
    return lexer->cur.offset < lexer->src.code.size;
}
/*
 * Like isspace(), but excludes '\n' so the lexer can emit line feeds as
 * TOKEN_LF instead of silently skipping them.
 *
 * Fix: isspace() must receive a value representable as unsigned char (or
 * EOF); passing a plain char is undefined behavior when char is signed and
 * the byte is negative (e.g. UTF-8 continuation bytes). Cast first.
 */
static bool
_isspace(char c)
{
    return c != '\n' && isspace((unsigned char) c);
}
/*
 * Fill *token as a single-character token of the given kind located at the
 * lexer's current cursor. The cursor itself is NOT advanced here.
 */
static void
lexer_init_char_value_token(lexer_t *lexer, token_t *token, token_kind_t kind)
{
    *token = (token_t){
        .kind = kind,
        .value =
            (string_view_t){
                .chars = lexer->src.code.chars + lexer->cur.offset,
                .size = 1,
            },
        .loc =
            (token_loc_t){
                .src = lexer->src,
                .cur = lexer->cur,
            },
    };
}
/*
 * Fill *token as a multi-character token of the given kind spanning from the
 * saved start cursor `cur` up to (excluding) the lexer's current cursor.
 */
static void
lexer_init_str_value_token(lexer_t *lexer, token_t *token, token_kind_t kind, lexer_cursor_t cur)
{
    size_t len = lexer->cur.offset - cur.offset;
    *token = (token_t){
        .kind = kind,
        .value =
            (string_view_t){
                .chars = lexer->src.code.chars + cur.offset,
                .size = len,
            },
        .loc =
            (token_loc_t){
                .src = lexer->src,
                .cur = cur,
            },
    };
}
/*
 * Fill *token as the end-of-file sentinel: TOKEN_EOF with an empty value,
 * located at the lexer's final cursor position.
 */
static void
lexer_init_eof_token(lexer_t *lexer, token_t *token)
{
    *token = (token_t){
        .kind = TOKEN_EOF,
        .value = (string_view_t){ 0 },
        .loc =
            (token_loc_t){
                .src = lexer->src,
                .cur = lexer->cur,
            },
    };
}
/*
 * Classify an identifier-shaped lexeme: return the keyword kind when the
 * text matches a reserved word, otherwise TOKEN_ID.
 */
static token_kind_t
lexer_str_to_token_kind(string_view_t text)
{
    static const struct
    {
        const char *spelling;
        token_kind_t kind;
    } keywords[] = {
        { "if", TOKEN_IF },   { "else", TOKEN_ELSE },     { "while", TOKEN_WHILE },
        { "var", TOKEN_VAR }, { "return", TOKEN_RETURN }, { "fn", TOKEN_FN },
    };

    for (size_t i = 0; i < sizeof(keywords) / sizeof(keywords[0]); ++i) {
        if (string_view_eq_to_cstr(text, keywords[i].spelling)) {
            return keywords[i].kind;
        }
    }
    return TOKEN_ID;
}
/*
 * Read the next token into *token without consuming it: the lexer's cursor
 * is unchanged afterwards. Convenience wrapper for lexer_lookahead(.., 1).
 */
void
lexer_peek_next(lexer_t *lexer, token_t *token)
{
    lexer_lookahead(lexer, token, 1);
}
/*
 * Write the n-th upcoming token into *token, then rewind the cursor so the
 * lexer state is left untouched. With n == 0, *token is not written.
 */
void
lexer_lookahead(lexer_t *lexer, token_t *token, size_t n)
{
    lexer_cursor_t saved = lexer->cur;
    for (size_t remaining = n; remaining > 0; --remaining) {
        lexer_next_token(lexer, token);
    }
    lexer->cur = saved;
}
/*
 * Return a view of the full source line containing the token location,
 * from its beginning-of-line up to (excluding) the next '\n' or NUL.
 */
string_view_t
token_loc_to_line(token_loc_t loc)
{
    size_t bol = loc.cur.bol;
    string_view_t line = { 0 };
    line.chars = loc.src.code.chars + bol;

    for (;;) {
        if ((line.size + bol) >= loc.src.code.size) {
            break;
        }
        char c = line.chars[line.size];
        if (c == '\n' || c == '\0') {
            break;
        }
        ++line.size;
    }
    return line;
}
/*
 * One-based line number of the token; rows are stored zero-based internally.
 */
size_t
token_loc_to_lineno(token_loc_t loc)
{
    size_t zero_based_row = loc.cur.row;
    return zero_based_row + 1;
}
/*
 * One-based column number: the token's offset from the beginning of its
 * line, converted from zero-based to one-based.
 */
size_t
token_loc_to_colno(token_loc_t loc)
{
    size_t zero_based_col = loc.cur.offset - loc.cur.bol;
    return zero_based_col + 1;
}