script

script.git
git clone git://git.lenczewski.org/script.git
Log | Files | Refs

commit 5492185850c59c6ea42c385edc684ecdf7e32646
parent ea67d2dce57bf5c68942c320c78790c80d7acc36
Author: Mikołaj Lenczewski <mblenczewski@gmail.com>
Date:   Tue, 14 Jan 2025 23:39:50 +0000

Improve parser for vardecls and returns, implement basic vm stub

Improved the parser to track proper type information throughout, and to
emit a trivial IR. Proper block scoping, and support for blocks in
general, is not yet implemented. Implemented a very simple symbol table
to track global variables; support for scope-local stack symbols still
needs to be added.

VM stub has been extended to support evaluating our current testcases.
Supports simple loads and stores to heap memory, as well as basic push
and pop operations to stack memory, with trivial overrun and underrun
checking.

Diffstat:
A examples/expr.script | 1 +
M examples/test.script | 7 +++++--
A examples/vardecl.script | 2 ++
A libscript/debug.c | 318 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M libscript/libscript.c | 551 ++++++++++++++++++++++++++++++++++++++-----------------------------------------
M libscript/libscript.h | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M libscript/libscript_internal.h | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
D libscript/utils.c | 21 ---------------------
M scriptvm/scriptvm.c | 316 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
9 files changed, 1161 insertions(+), 339 deletions(-)

diff --git a/examples/expr.script b/examples/expr.script @@ -0,0 +1 @@ +return 8 8 * 64 / 4 + 2 -; diff --git a/examples/test.script b/examples/test.script @@ -1,2 +1,5 @@ -x : u64 = 0; -return 42 1 + x *; +x : u64 = 10; +y : u64 = 30; +z : u64 = 20; + +return 10 1 + x - y * z /; diff --git a/examples/vardecl.script b/examples/vardecl.script @@ -0,0 +1,2 @@ +x : u64 = 42; +return x; diff --git a/libscript/debug.c b/libscript/debug.c @@ -0,0 +1,318 @@ +#include "libscript_internal.h" + +static char const * +dump_token_type_str(enum script_token_type type) +{ + static char const *type_to_str[] = { + [SCR_TOKEN_EOF] = "EOF", + [SCR_TOKEN_IDENT] = "IDENT", + [SCR_TOKEN_LITERAL_INT] = "LITERAL_INT", + [SCR_TOKEN_U64] = "U64", + [SCR_TOKEN_RETURN] = "RETURN", + [SCR_TOKEN_LPAREN] = "(", + [SCR_TOKEN_RPAREN] = ")", + [SCR_TOKEN_LBRACK] = "[", + [SCR_TOKEN_RBRACK] = "]", + [SCR_TOKEN_LBRACE] = "{", + [SCR_TOKEN_RBRACE] = "}", + [SCR_TOKEN_LANGLE] = "<", + [SCR_TOKEN_RANGLE] = ">", + [SCR_TOKEN_LSLASH] = "\\", + [SCR_TOKEN_RSLASH] = "/", + [SCR_TOKEN_COLON] = ":", + [SCR_TOKEN_SEMICOLON] = ";", + [SCR_TOKEN_DOT] = ".", + [SCR_TOKEN_COMMA] = ",", + [SCR_TOKEN_EQUALS] = "=", + [SCR_TOKEN_PLUS] = "+", + [SCR_TOKEN_MINUS] = "-", + [SCR_TOKEN_STAR] = "*", + }; + + return type_to_str[type]; +} + +static int +dump_token(struct script_token *token, char *buf, size_t cap) +{ + switch (token->type) { + case SCR_TOKEN_IDENT: + return snprintf(buf, cap, "Token {type: %s, ident: %" PRIu64 "}", + dump_token_type_str(token->type), token->ident.v); + + case SCR_TOKEN_LITERAL_INT: + return snprintf(buf, cap, "Token {type: %s, literal_int: %" PRIu64 "}", + dump_token_type_str(token->type), token->literal_int); + + default: + return snprintf(buf, cap, "Token {type: %s}", dump_token_type_str(token->type)); + } +} + +static void +dump_token_stream(struct compile_ctx *ctx) +{ + struct script_token *ptr = ctx->stream.ptr; + struct script_token *end = ctx->stream.ptr + ctx->stream.len; + + 
dbglog(ctx, "token stream: %zu tokens\n", ctx->stream.len); + + char buf[64]; + while (ptr < end) { + int written = dump_token(ptr, buf, sizeof buf); + assert(written); + + dbglog(ctx, "\t%.*s\n", written, buf); + + ptr++; + } + + dbglog(ctx, "\n"); +} + +static void +dump_typeinfo(struct compile_ctx *ctx, struct script_typeinfo *typeinfo, size_t indent) +{ +#define leader(indent) \ + for (size_t i = 0; i < indent; i++) dbglog(ctx, " "); + + switch (typeinfo->type) { + case SCR_TYPE_U64: { + static char const *type_str[] = { + [SCR_TYPE_U64] = "U64", + }; + + leader(indent) + dbglog(ctx, "Typeinfo { type: %s, size: %zu, alignment: %zu }\n", + type_str[typeinfo->type], typeinfo->size, typeinfo->alignment); + } break; + } + +#undef leader +} + +static void +dump_symbol(struct compile_ctx *ctx, struct script_symbol *sym, size_t indent) +{ +#define leader(indent) \ + for (size_t i = 0; i < indent; i++) dbglog(ctx, " "); + + char *str; + size_t len; + ident_pool_get(&ctx->ident_pool, sym->ident, &str, &len); + + switch (sym->type) { + case SCR_SYMBOL_VARIABLE: + leader(indent) + dbglog(ctx, "variable: global: %d, ident: %.*s, addr: 0x%lx\n", + sym->variable.parent == NULL, (int) len, str, sym->variable.addr); + + leader(indent + 1) + dbglog(ctx, "typeinfo:\n"); + dump_typeinfo(ctx, sym->variable.typeinfo, indent + 2); + break; + } + +#undef leader +} + +static void +dump_expr(struct compile_ctx *ctx, struct script_expr *expr, size_t indent) +{ +#define leader(indent) \ + for (size_t i = 0; i < indent; i++) dbglog(ctx, " "); + + char *str; + size_t len; + + switch (expr->type) { + case SCR_EXPR_IDENT: + ident_pool_get(&ctx->ident_pool, expr->ident, &str, &len); + + leader(indent) + dbglog(ctx, "ident: %.*s\n", (int) len, str); + + leader(indent + 1) + dbglog(ctx, "typeinfo:\n"); + dump_typeinfo(ctx, expr->typeinfo, indent + 2); + break; + + case SCR_EXPR_LITERAL_INT: + leader(indent) + dbglog(ctx, "literal int: %" PRIu64 "\n", expr->literal_int); + + leader(indent + 1) + 
dbglog(ctx, "typeinfo:\n"); + dump_typeinfo(ctx, expr->typeinfo, indent + 2); + break; + + case SCR_EXPR_BINARY_OP: { + static char const *binary_op_str[] = { + [SCR_BINARY_OP_ADD] = "+", + [SCR_BINARY_OP_SUB] = "-", + [SCR_BINARY_OP_MUL] = "*", + [SCR_BINARY_OP_DIV] = "/", + }; + + leader(indent) + dbglog(ctx, "binary op: %s\n", binary_op_str[expr->binary_op.type]); + + leader(indent + 1) + dbglog(ctx, "typeinfo:\n"); + dump_typeinfo(ctx, expr->typeinfo, indent + 2); + + leader(indent + 1) + dbglog(ctx, "lhs:\n"); + dump_expr(ctx, expr->binary_op.lhs, indent + 2); + + leader(indent + 1) + dbglog(ctx, "rhs:\n"); + dump_expr(ctx, expr->binary_op.rhs, indent + 2); + } break; + } + +#undef leader +} + +static void +dump_stmt(struct compile_ctx *ctx, struct script_stmt *node, size_t indent) +{ +#define leader(indent) \ + for (size_t i = 0; i < indent; i++) dbglog(ctx, " "); + + char *str; + size_t len; + + struct script_list *list; + switch (node->type) { + case SCR_STMT_BLOCK: + leader(indent) + dbglog(ctx, "block\n"); + + list = &node->block.children; + SCRIPT_LIST_ITER(list) { + struct script_stmt *child = + SCRIPT_FROM_NODE(it, struct script_stmt, list_node); + + dump_stmt(ctx, child, indent + 1); + } + break; + + case SCR_STMT_VARDECL: + ident_pool_get(&ctx->ident_pool, node->vardecl.ident, &str, &len); + + leader(indent) + dbglog(ctx, "vardecl: ident: %.*s\n", (int) len, str); + + leader(indent + 1) + dbglog(ctx, "typeinfo:\n"); + dump_typeinfo(ctx, node->vardecl.typeinfo, indent + 2); + + leader(indent + 1) + dbglog(ctx, "expr:\n"); + dump_expr(ctx, node->vardecl.expr, indent + 2); + break; + + case SCR_STMT_RET: + leader(indent) + dbglog(ctx, "return\n"); + + leader(indent + 1) + dbglog(ctx, "expr:\n"); + dump_expr(ctx, node->ret.expr, indent + 2); + break; + } + +#undef leader +} + +static void +dump_symbol_table(struct compile_ctx *ctx) +{ + dbglog(ctx, "info: symbol table:\n"); + for (size_t i = 0; i < ctx->symtab.len; i++) { + struct script_symbol *sym = 
&ctx->symtab.ptr[i]; + + dump_symbol(ctx, sym, 1); + } +} + +static void +dump_ast(struct compile_ctx *ctx) +{ + dbglog(ctx, "info: ast:\n"); + SCRIPT_LIST_ITER(&ctx->ast.roots) { + struct script_stmt *stmt = SCRIPT_FROM_NODE(it, struct script_stmt, list_node); + + dump_stmt(ctx, stmt, 1); + } +} + +static inline char const * +dump_ir_opcode_str(enum script_ir_opcode opcode) +{ + switch (opcode) { + case SCR_IR_LOAD: return "LOAD"; + case SCR_IR_STORE: return "STORE"; + case SCR_IR_PUSH: return "PUSH"; + case SCR_IR_POP: return "POP"; + case SCR_IR_RET: return "RET"; + case SCR_IR_ADD: return "ADD"; + case SCR_IR_SUB: return "SUB"; + case SCR_IR_MUL: return "MUL"; + case SCR_IR_DIV: return "DIV"; + } +} + +static inline char const * +dump_ir_typeinfo_str(struct script_ir_typeinfo *typeinfo) +{ + switch (typeinfo->type) { + case SCR_IR_TYPE_U64: return "U64"; + } +} + +static void +dump_ir_operands(struct compile_ctx *ctx, struct script_ir_operand *buf, size_t len) +{ + for (size_t i = 0; i < len; i++) { + struct script_ir_operand *operand = &buf[i]; + + char *str; + size_t str_len; + + switch (operand->type) { + case SCR_IR_OPERAND_LITERAL: + dbglog(ctx, "LITERAL{0x%" PRIx64 "}", operand->literal); + break; + + case SCR_IR_OPERAND_POINTER: + dbglog(ctx, "POINTER{0x%" PRIx64 "}", operand->pointer); + break; + + case SCR_IR_OPERAND_IDENT: + ident_pool_get(&ctx->ident_pool, operand->ident, &str, &str_len); + dbglog(ctx, "IDENT{%.*s}", (int) str_len, str); + break; + } + + dbglog(ctx, ", "); + } +} + +static void +dump_ir(struct compile_ctx *ctx) +{ + dbglog(ctx, "info: ir:\n"); + for (size_t i = 0; i < ctx->ir.len; i++) { + struct script_ir_inst *inst = &ctx->ir.ptr[i]; + + dbglog(ctx, "\t[%03zu] %5s<%s> ", i, + dump_ir_opcode_str(inst->opcode), + dump_ir_typeinfo_str(inst->typeinfo)); + + dump_ir_operands(ctx, inst->operands, inst->operand_count); + + dbglog(ctx, "\n"); + } +} diff --git a/libscript/libscript.c b/libscript/libscript.c @@ -1,29 +1,8 @@ #include 
"libscript_internal.h" -struct token_stream { - struct script_token *ptr; - size_t len, cur; -}; - -struct compile_ctx { - FILE *errstream; - int verbose; - - struct arena *arena; - - struct token_stream *stream; - - char scratch[64]; -}; - -static inline void -dbglog(struct compile_ctx *ctx, char const *fmt, ...) -{ - va_list va; - va_start(va, fmt); - vfprintf(ctx->errstream, fmt, va); - va_end(va); -} +/* tokeniser (lexer) + * =========================================================================== + */ static int try_tokenise_keyword(char *src, char *end, struct script_token *out) @@ -41,13 +20,13 @@ try_tokenise_keyword(char *src, char *end, struct script_token *out) return -1; } -static struct token_stream -tokenise(struct compile_ctx *ctx, char *src, size_t src_len) +static int +tokenise(struct compile_ctx *ctx) { if (ctx->verbose) - dbglog(ctx, "info: tokenise() for %zu bytes of source\n", src_len); + dbglog(ctx, "info: tokenising %zu bytes of source\n", ctx->len); - char *end = src + src_len; + char *src = ctx->src, *end = ctx->src + ctx->len; struct script_token *token; char *buf = src, *buf_end = src; @@ -73,7 +52,7 @@ tokenise(struct compile_ctx *ctx, char *src, size_t src_len) case SCR_TOKEN_DOT: case SCR_TOKEN_COMMA: case SCR_TOKEN_PLUS: case SCR_TOKEN_MINUS: case SCR_TOKEN_STAR: case SCR_TOKEN_EQUALS: - token = ALLOC_SIZED(ctx->arena, struct script_token); + token = token_stream_alloc(&ctx->stream); assert(token); token->type = (enum script_token_type) lookahead[0]; @@ -102,7 +81,7 @@ tokenise(struct compile_ctx *ctx, char *src, size_t src_len) "'%.*s'\n", (int) (buf_end - buf), buf); } - token = ALLOC_SIZED(ctx->arena, struct script_token); + token = token_stream_alloc(&ctx->stream); assert(token); token->type = SCR_TOKEN_LITERAL_INT; @@ -127,12 +106,13 @@ tokenise(struct compile_ctx *ctx, char *src, size_t src_len) dbglog(ctx, "info: have ident or keyword: '%.*s'\n", (int) (buf_end - buf), buf); - token = ALLOC_SIZED(ctx->arena, struct 
script_token); + token = token_stream_alloc(&ctx->stream); assert(token); if (try_tokenise_keyword(buf, buf_end, token) < 0) { token->type = SCR_TOKEN_IDENT; - // TODO: get flystr + token->ident = ident_pool_intern(&ctx->ident_pool, + buf, buf_end - buf); } src = buf_end; @@ -146,109 +126,37 @@ next_char: src++; } - return (struct token_stream) { - .ptr = ctx->arena->ptr, - .len = ctx->arena->len / sizeof *token, - .cur = 0, - }; + return 0; error: - return (struct token_stream) {0}; -} - -static inline char const * -token_type_str(enum script_token_type type) -{ - static char const *type_to_str[] = { - [SCR_TOKEN_EOF] = "EOF", - [SCR_TOKEN_IDENT] = "IDENT", - [SCR_TOKEN_LITERAL_INT] = "LITERAL_INT", - [SCR_TOKEN_U64] = "U64", - [SCR_TOKEN_RETURN] = "RETURN", - [SCR_TOKEN_LPAREN] = "(", - [SCR_TOKEN_RPAREN] = ")", - [SCR_TOKEN_LBRACK] = "[", - [SCR_TOKEN_RBRACK] = "]", - [SCR_TOKEN_LBRACE] = "{", - [SCR_TOKEN_RBRACE] = "}", - [SCR_TOKEN_LANGLE] = "<", - [SCR_TOKEN_RANGLE] = ">", - [SCR_TOKEN_LSLASH] = "\\", - [SCR_TOKEN_RSLASH] = "/", - [SCR_TOKEN_COLON] = ":", - [SCR_TOKEN_SEMICOLON] = ";", - [SCR_TOKEN_DOT] = ".", - [SCR_TOKEN_COMMA] = ",", - [SCR_TOKEN_EQUALS] = "=", - [SCR_TOKEN_PLUS] = "+", - [SCR_TOKEN_MINUS] = "-", - [SCR_TOKEN_STAR] = "*", - }; - - return type_to_str[type]; -} - -static inline int -dump_token(struct script_token *token, char *buf, size_t cap) -{ - switch (token->type) { - case SCR_TOKEN_IDENT: - return snprintf(buf, cap, "Token {type: %s, ident: %" PRIu64 "}", - token_type_str(token->type), token->ident.v); - - case SCR_TOKEN_LITERAL_INT: - return snprintf(buf, cap, "Token {type: %s, literal_int: %" PRIu64 "}", - token_type_str(token->type), token->literal_int); - - default: - return snprintf(buf, cap, "Token {type: %s}", token_type_str(token->type)); - } + return -1; } -static void -dump_token_stream(struct compile_ctx *ctx, struct script_token *toks, size_t len) -{ - struct script_token *end = toks + len; - - dbglog(ctx, "token stream: 
%zu tokens\n", len); - - char buf[64]; - while (toks < end) { - int written = dump_token(toks, buf, sizeof buf); - assert(written); - - dbglog(ctx, "\t%.*s\n", written, buf); - - toks++; - } - - dbglog(ctx, "\n"); -} +/* parser + * =========================================================================== + */ static inline struct script_token peek(struct compile_ctx *ctx, size_t off) { - if (ctx->stream->cur + off >= ctx->stream->len) + if (ctx->stream.cur + off >= ctx->stream.len) return (struct script_token) { .type = SCR_TOKEN_EOF, }; + struct script_token tok = ctx->stream.ptr[ctx->stream.cur + off]; if (ctx->verbose) - dbglog(ctx, "peek(%zu/%zu), %d\n", ctx->stream->cur + off, - ctx->stream->len, ctx->stream->ptr[ctx->stream->cur + off].type); + dbglog(ctx, "info: peek(%zu/%zu) = '%s'\n", ctx->stream.cur + off, + ctx->stream.len, dump_token_type_str(tok.type)); - return ctx->stream->ptr[ctx->stream->cur + off]; + return tok; } static inline struct script_token next(struct compile_ctx *ctx) { - if (ctx->stream->cur > ctx->stream->len) + if (ctx->stream.cur > ctx->stream.len) return (struct script_token) { .type = SCR_TOKEN_EOF, }; - if (ctx->verbose) - dbglog(ctx, "next(%zu/%zu), %d\n", ctx->stream->cur, ctx->stream->len, - ctx->stream->ptr[ctx->stream->cur].type); - - return ctx->stream->ptr[ctx->stream->cur++]; + return ctx->stream.ptr[ctx->stream.cur++]; } static inline struct script_token @@ -257,24 +165,38 @@ expect(struct compile_ctx *ctx, enum script_token_type expected) struct script_token tok = next(ctx); if (ctx->verbose) - dbglog(ctx, "expect(%zu/%zu, T: %d), %d\n", - ctx->stream->cur, ctx->stream->len, expected, tok.type); + dbglog(ctx, "info: expect(%zu/%zu, '%s') = '%s'\n", ctx->stream.cur, + ctx->stream.len, dump_token_type_str(expected), + dump_token_type_str(tok.type)); if (tok.type != expected) { - int len = dump_token(&tok, ctx->scratch, sizeof ctx->scratch); + char buf[64]; + int len = dump_token(&tok, buf, sizeof buf); dbglog(ctx, "error: 
expected %s, got: %.*s\n", - token_type_str(expected), len, ctx->scratch); + dump_token_type_str(expected), len, buf); PANIC(); } return tok; } +static inline int +is_primitive_typeinfo(struct script_typeinfo *typeinfo) +{ + switch (typeinfo->type) { + case SCR_TYPE_U64: + return 1; + + default: + return 0; + } +} + static struct script_typeinfo * primitive_typeinfo(enum script_type type) { static struct script_typeinfo typeinfos[] = { - [SCR_TOKEN_U64] = { .type = SCR_TYPE_U64, .size = 8, .alignment = 8, }, + [SCR_TYPE_U64] = { .type = SCR_TYPE_U64, .size = 8, .alignment = 8, }, }; return &typeinfos[type]; @@ -289,7 +211,7 @@ literal_typeinfo(struct compile_ctx *ctx, enum script_token_type type) default: { dbglog(ctx, "error: invalid token type has no type info: %s\n", - token_type_str(type)); + dump_token_type_str(type)); return NULL; } break; } @@ -305,13 +227,12 @@ parse_typeinfo(struct compile_ctx *ctx) return primitive_typeinfo(SCR_TYPE_U64); default: { - int len = dump_token(&tok, ctx->scratch, sizeof ctx->scratch); - dbglog(ctx, "error: expected typeinfo, got: %.*s\n", - len, ctx->scratch); + char buf[64]; + int len = dump_token(&tok, buf, sizeof buf); + dbglog(ctx, "error: expected typeinfo, got: %.*s\n", len, buf); return NULL; } break; } - } static struct script_expr * @@ -326,7 +247,7 @@ parse_expr(struct compile_ctx *ctx) struct script_expr *expr; switch (tok.type) { case SCR_TOKEN_IDENT: - expr = ALLOC_SIZED(ctx->arena, struct script_expr); + expr = ALLOC_SIZED(&ctx->arena, struct script_expr); assert(expr); tok = next(ctx); @@ -334,13 +255,16 @@ parse_expr(struct compile_ctx *ctx) expr->type = SCR_EXPR_IDENT; expr->ident = tok.ident; - // TODO: fetch the previously registered typeinfo? 
- expr->typeinfo = primitive_typeinfo(SCR_TYPE_U64); + struct script_symbol *sym = symbol_table_find(&ctx->symtab, tok.ident); + assert(sym); + + assert(sym->type == SCR_SYMBOL_VARIABLE); + expr->typeinfo = sym->variable.typeinfo; break; case SCR_TOKEN_LITERAL_INT: - expr = ALLOC_SIZED(ctx->arena, struct script_expr); + expr = ALLOC_SIZED(&ctx->arena, struct script_expr); assert(expr); tok = next(ctx); @@ -355,7 +279,7 @@ parse_expr(struct compile_ctx *ctx) case SCR_TOKEN_MINUS: case SCR_TOKEN_STAR: case SCR_TOKEN_RSLASH: - expr = ALLOC_SIZED(ctx->arena, struct script_expr); + expr = ALLOC_SIZED(&ctx->arena, struct script_expr); assert(expr); expr->type = SCR_EXPR_BINARY_OP; @@ -386,41 +310,60 @@ parse_expr(struct compile_ctx *ctx) if (i == sizeof stack) { struct script_token tok = peek(ctx, 0); - int len = dump_token(&tok, ctx->scratch, sizeof ctx->scratch); + + char buf[64]; + int len = dump_token(&tok, buf, sizeof buf); dbglog(ctx, "error: stack overflow while parsing expression: '%.*s'\n", - len, ctx->scratch); + len, buf); + return NULL; } + assert(i == 0); + end: return stack[0]; } static struct script_stmt * -parse_return(struct compile_ctx *ctx) +parse_return(struct compile_ctx *ctx, struct script_symbol *parent) { - struct script_stmt *stmt = ALLOC_SIZED(ctx->arena, struct script_stmt); + (void) parent; + + struct script_stmt *stmt = ALLOC_SIZED(&ctx->arena, struct script_stmt); assert(stmt); stmt->type = SCR_STMT_RET; expect(ctx, SCR_TOKEN_RETURN); stmt->ret.expr = parse_expr(ctx); + /* TODO: validate that return type is the same as parent scope function type */ + expect(ctx, SCR_TOKEN_SEMICOLON); return stmt; } static struct script_stmt * -parse_vardecl(struct compile_ctx *ctx) +parse_vardecl(struct compile_ctx *ctx, struct script_symbol *parent) { - struct script_stmt *stmt = ALLOC_SIZED(ctx->arena, struct script_stmt); + struct script_symbol *sym = symbol_table_push(&ctx->symtab); + assert(sym); + + sym->type = SCR_SYMBOL_VARIABLE; + 
sym->list_node.prev = sym->list_node.next = NULL; + sym->variable.parent = parent; + + struct script_stmt *stmt = ALLOC_SIZED(&ctx->arena, struct script_stmt); assert(stmt); stmt->type = SCR_STMT_VARDECL; - stmt->vardecl.ident = expect(ctx, SCR_TOKEN_IDENT).ident; + sym->ident = stmt->vardecl.ident = expect(ctx, SCR_TOKEN_IDENT).ident; + expect(ctx, SCR_TOKEN_COLON); - stmt->vardecl.typeinfo = parse_typeinfo(ctx); + sym->variable.typeinfo = stmt->vardecl.typeinfo = parse_typeinfo(ctx); + sym->variable.addr = symbol_table_next_addr(&ctx->symtab, sym->variable.typeinfo); + expect(ctx, SCR_TOKEN_EQUALS); stmt->vardecl.expr = parse_expr(ctx); @@ -432,218 +375,258 @@ parse_vardecl(struct compile_ctx *ctx) } static struct script_stmt * -parse_statement(struct compile_ctx *ctx) +parse_statement(struct compile_ctx *ctx, struct script_symbol *parent) { struct script_token tok = peek(ctx, 0); switch (tok.type) { case SCR_TOKEN_RETURN: - return parse_return(ctx); + return parse_return(ctx, parent); case SCR_TOKEN_IDENT: - return parse_vardecl(ctx); + return parse_vardecl(ctx, parent); default: { - int len = dump_token(&tok, ctx->scratch, sizeof ctx->scratch); + char buf[64]; + int len = dump_token(&tok, buf, sizeof buf); dbglog(ctx, "error: expected a statement, got: '%.*s'\n", - len, ctx->scratch); + len, buf); return NULL; } break; } } -static struct script_stmt * -parse_statement_list(struct compile_ctx *ctx) +static inline struct script_ir_typeinfo * +primitive_ir_typeinfo(enum script_ir_type type) { - struct script_stmt *stmt = ALLOC_SIZED(ctx->arena, struct script_stmt); - assert(stmt); + static struct script_ir_typeinfo typeinfos[] = { + [SCR_IR_TYPE_U64] = { .type = SCR_IR_TYPE_U64, .size = 8, .alignment = 8, }, + }; - stmt->type = SCR_STMT_BLOCK; - stmt->block.children.head = stmt->block.children.tail = NULL; + return &typeinfos[type]; +} - while (peek(ctx, 0).type != SCR_TOKEN_EOF) { - struct script_stmt *child = parse_statement(ctx); - 
list_push_tail(&stmt->block.children, &child->list_node); +static inline struct script_ir_typeinfo * +typeinfo_to_ir_typeinfo(struct script_typeinfo *typeinfo) +{ + switch (typeinfo->type) { + case SCR_TYPE_U64: return primitive_ir_typeinfo(SCR_IR_TYPE_U64); } - - return stmt; } -static struct script_stmt * -parse(struct compile_ctx *ctx) +static int +emit_expr(struct compile_ctx *ctx, struct script_ir_inst *scratch, struct script_expr *expr) { - struct script_stmt *ast = parse_statement_list(ctx); - assert(ast); + struct script_symbol *sym; + switch (expr->type) { + case SCR_EXPR_IDENT: + sym = symbol_table_find(&ctx->symtab, expr->ident); + assert(sym); + + scratch->opcode = SCR_IR_LOAD; + scratch->typeinfo = typeinfo_to_ir_typeinfo(expr->typeinfo); + scratch->operands[0].type = SCR_IR_OPERAND_POINTER; + scratch->operands[0].pointer = sym->variable.addr; + scratch->operand_count = 1; + break; - assert(ctx->stream->cur == ctx->stream->len); + case SCR_EXPR_LITERAL_INT: + scratch->opcode = SCR_IR_PUSH; + scratch->typeinfo = typeinfo_to_ir_typeinfo(expr->typeinfo); + scratch->operands[0].type = SCR_IR_OPERAND_LITERAL; + scratch->operands[0].literal = expr->literal_int; + scratch->operand_count = 1; + break; - return ast; -} + case SCR_EXPR_BINARY_OP: + if (emit_expr(ctx, scratch, expr->binary_op.rhs) < 0) + return -1; -static void -dump_typeinfo(struct compile_ctx *ctx, struct script_typeinfo *typeinfo, size_t indent) -{ -#define leader(indent) \ - for (size_t i = 0; i < indent; i++) dbglog(ctx, " "); + if (emit_expr(ctx, scratch, expr->binary_op.lhs) < 0) + return -1; - switch (typeinfo->type) { - case SCR_TYPE_U64: { - static char const *type_str[] = { - [SCR_TYPE_U64] = "U64", - }; + switch (expr->binary_op.type) { + case SCR_BINARY_OP_ADD: scratch->opcode = SCR_IR_ADD; break; + case SCR_BINARY_OP_SUB: scratch->opcode = SCR_IR_SUB; break; + case SCR_BINARY_OP_MUL: scratch->opcode = SCR_IR_MUL; break; + case SCR_BINARY_OP_DIV: scratch->opcode = SCR_IR_DIV; break; 
+ } - leader(indent) - dbglog(ctx, "Typeinfo { type: %s, size: %zu, alignment: %zu }\n", - type_str[typeinfo->type], typeinfo->size, typeinfo->alignment); - } break; + scratch->typeinfo = typeinfo_to_ir_typeinfo(expr->typeinfo); + scratch->operand_count = 0; + break; } -#undef leader + return ir_push(&ctx->ir, scratch); } -static void -dump_expr(struct compile_ctx *ctx, struct script_expr *expr, size_t indent) +static int +emit_return(struct compile_ctx *ctx, struct script_ir_inst *scratch, struct script_stmt *stmt) { -#define leader(indent) \ - for (size_t i = 0; i < indent; i++) dbglog(ctx, " "); + assert(stmt->type == SCR_STMT_RET); + assert(is_primitive_typeinfo(stmt->ret.expr->typeinfo)); - switch (expr->type) { - case SCR_EXPR_IDENT: - leader(indent) - dbglog(ctx, "ident: %" PRIu64 "\n", expr->ident.v); + if (emit_expr(ctx, scratch, stmt->ret.expr) < 0) + return -1; - leader(indent + 1) - dbglog(ctx, "typeinfo:\n"); - dump_typeinfo(ctx, expr->typeinfo, indent + 2); - break; + scratch->opcode = SCR_IR_RET; + scratch->typeinfo = typeinfo_to_ir_typeinfo(stmt->ret.expr->typeinfo); + scratch->operand_count = 0; - case SCR_EXPR_LITERAL_INT: - leader(indent) - dbglog(ctx, "literal int: %" PRIu64 "\n", expr->literal_int); + return ir_push(&ctx->ir, scratch); +} - leader(indent + 1) - dbglog(ctx, "typeinfo:\n"); - dump_typeinfo(ctx, expr->typeinfo, indent + 2); - break; +static int +emit_vardecl(struct compile_ctx *ctx, struct script_ir_inst *scratch, struct script_stmt *stmt) +{ + assert(stmt->type == SCR_STMT_VARDECL); + assert(is_primitive_typeinfo(stmt->vardecl.typeinfo)); - case SCR_EXPR_BINARY_OP: { - static char const *binary_op_str[] = { - [SCR_BINARY_OP_ADD] = "+", - [SCR_BINARY_OP_SUB] = "-", - [SCR_BINARY_OP_MUL] = "*", - [SCR_BINARY_OP_DIV] = "/", - }; + struct script_symbol *sym = symbol_table_find(&ctx->symtab, stmt->vardecl.ident); + assert(sym); - leader(indent) - dbglog(ctx, "binary op: %s\n", binary_op_str[expr->binary_op.type]); + if (emit_expr(ctx, 
scratch, stmt->vardecl.expr) < 0) + return -1; - leader(indent + 1) - dbglog(ctx, "typeinfo:\n"); - dump_typeinfo(ctx, expr->typeinfo, indent + 2); + scratch->opcode = SCR_IR_STORE; + scratch->typeinfo = typeinfo_to_ir_typeinfo(stmt->vardecl.typeinfo); + scratch->operands[0].type = SCR_IR_OPERAND_POINTER; + scratch->operands[0].pointer = sym->variable.addr; + scratch->operand_count = 1; - leader(indent + 1) - dbglog(ctx, "lhs:\n"); - dump_expr(ctx, expr->binary_op.lhs, indent + 2); + return ir_push(&ctx->ir, scratch); - leader(indent + 1) - dbglog(ctx, "rhs:\n"); - dump_expr(ctx, expr->binary_op.rhs, indent + 2); - } break; - } +#if 0 /* TODO: more advanced rules for emitting variables */ + if (sym->parent) { /* this is a function-local variable, enable full expressions */ + + } else { /* this is a global variable, can only have compiletime expressions */ -#undef leader + } +#endif } -static void -dump_parse_tree(struct compile_ctx *ctx, struct script_stmt *node, size_t indent) -{ -#define leader(indent) \ - for (size_t i = 0; i < indent; i++) dbglog(ctx, " "); +static int +emit(struct compile_ctx *ctx, struct script_ir_inst *scratch, struct script_stmt *stmt); - struct script_list *list; - switch (node->type) { - case SCR_STMT_BLOCK: - leader(indent) - dbglog(ctx, "block\n"); +static int +emit_block(struct compile_ctx *ctx, struct script_ir_inst *scratch, struct script_stmt *stmt) +{ + (void) ctx; + (void) scratch; + (void) stmt; - list = &node->block.children; - SCRIPT_LIST_ITER(list) { - struct script_stmt *child = - SCRIPT_FROM_NODE(it, struct script_stmt, list_node); + /* TODO: reserve enough stack space for all locals */ + /* TODO: implement me */ - dump_parse_tree(ctx, child, indent + 1); - } - break; + return -1; +} - case SCR_STMT_VARDECL: - leader(indent) - dbglog(ctx, "vardecl: ident: %" PRIu64 "\n", node->vardecl.ident.v); +static int +emit(struct compile_ctx *ctx, struct script_ir_inst *scratch, struct script_stmt *stmt) +{ + switch (stmt->type) { + 
case SCR_STMT_BLOCK: return emit_block(ctx, scratch, stmt); + case SCR_STMT_VARDECL: return emit_vardecl(ctx, scratch, stmt); + case SCR_STMT_RET: return emit_return(ctx, scratch, stmt); + } - leader(indent + 1) - dbglog(ctx, "typeinfo:\n"); - dump_typeinfo(ctx, node->vardecl.typeinfo, indent + 2); + return -1; +} - leader(indent + 1) - dbglog(ctx, "expr:\n"); - dump_expr(ctx, node->vardecl.expr, indent + 2); - break; +static int +parse(struct compile_ctx *ctx) +{ + ctx->ast.roots.head = ctx->ast.roots.tail = NULL; - case SCR_STMT_RET: - leader(indent) - dbglog(ctx, "return\n"); + struct script_ir_inst inst; + while (peek(ctx, 0).type != SCR_TOKEN_EOF) { + struct script_stmt *stmt = parse_statement(ctx, NULL); + list_push_tail(&ctx->ast.roots, &stmt->list_node); - leader(indent + 1) - dbglog(ctx, "expr:\n"); - dump_expr(ctx, node->ret.expr, indent + 2); - break; + if (emit(ctx, &inst, stmt) < 0) + return -1; } -#undef leader + assert(ctx->stream.cur == ctx->stream.len); + + return 0; } +/* libscript + * =========================================================================== + */ + int -script_parse(char *src, size_t src_len, void *mem, size_t mem_len, - struct script_stmt **out, FILE *errstream, int verbose) +script_compile(char *src, size_t src_len, void *mem, size_t mem_len, + struct script_program *out, FILE *errstream, int verbose) { - struct compile_ctx ctx = { - .errstream = errstream, - .verbose = verbose, - }; - - struct arena arena = { - .ptr = mem, - .cap = mem_len, - .len = 0, - }; - - if (ctx.verbose) - dbglog(&ctx, "info: arena cap: %zu bytes, verbose: %d\n", - arena.cap, verbose); - - ctx.arena = &arena; + assert(mem_len > sizeof(struct compile_ctx)); + + struct compile_ctx *ctx = mem; + ctx->arena.ptr = (char *) mem + sizeof *ctx; + ctx->arena.cap = mem_len - sizeof *ctx; + ctx->arena.len = 0; + + ctx->scratch.cap = SCRIPT_COMPILE_SCRATCH_BYTES; + ctx->scratch.ptr = ALLOC_ARRAY(&ctx->arena, char, ctx->scratch.cap); + ctx->scratch.len = 0; + + 
ctx->errstream = errstream; + ctx->verbose = verbose; + ctx->src = src; + ctx->len = src_len; + + ctx->ident_pool.cap = SCRIPT_COMPILE_MAX_IDENTS; + ctx->ident_pool.ptr = ALLOC_ARRAY(&ctx->arena, struct identifier, ctx->ident_pool.cap); + ctx->ident_pool.len = 0; + assert(ctx->ident_pool.ptr); + + ctx->stream.cap = SCRIPT_COMPILE_MAX_TOKS; + ctx->stream.ptr = ALLOC_ARRAY(&ctx->arena, struct script_token, ctx->stream.cap); + ctx->stream.len = ctx->stream.cur = 0; + assert(ctx->stream.ptr); + + ctx->symtab.cap = SCRIPT_COMPILE_MAX_SYMS; + ctx->symtab.ptr = ALLOC_ARRAY(&ctx->arena, struct script_symbol, ctx->symtab.cap); + ctx->symtab.len = 0; + assert(ctx->symtab.ptr); + + ctx->ir.cap = SCRIPT_COMPILE_MAX_IR_INSTRS; + ctx->ir.ptr = ALLOC_ARRAY(&ctx->arena, struct script_ir_inst, ctx->ir.cap); + ctx->ir.len = 0; + assert(ctx->ir.ptr); + + if (ctx->verbose) { + dbglog(ctx, "info: arena cap: %zu bytes, scratch cap: %zu bytes, verbose: %d\n", + ctx->arena.cap, ctx->scratch.cap, verbose); + } - struct token_stream stream = tokenise(&ctx, src, src_len); - if (!stream.ptr) { - dbglog(&ctx, "error: failed to tokenise source\n"); + int res; + if ((res = tokenise(ctx)) < 0) { + dbglog(ctx, "error: failed to tokenise source\n"); return -1; } - if (ctx.verbose) - dump_token_stream(&ctx, stream.ptr, stream.len); - - ctx.stream = &stream; + if (ctx->verbose) + dump_token_stream(ctx); - struct script_stmt *stmt = parse(&ctx); - if (!stmt) { - dbglog(&ctx, "error: failed to parse source\n"); + if ((res = parse(ctx)) < 0) { + dbglog(ctx, "error: failed to parse source\n"); return -1; } - if (verbose) - dump_parse_tree(&ctx, stmt, 0); + if (ctx->verbose) { + dump_symbol_table(ctx); + dbglog(ctx, "\n"); + dump_ast(ctx); + dbglog(ctx, "\n"); + dump_ir(ctx); + } - *out = stmt; + out->instructions.ptr = ctx->ir.ptr; + out->instructions.len = ctx->ir.len; + out->max_heap_bytes = ctx->symtab.address; return 0; } -#include "utils.c" +#include "debug.c" diff --git a/libscript/libscript.h 
b/libscript/libscript.h @@ -104,6 +104,26 @@ struct script_typeinfo { size_t alignment; }; +enum script_symbol_type { + SCR_SYMBOL_VARIABLE, +}; + +struct script_symbol { + enum script_symbol_type type; + + struct script_flystr ident; + + union { + struct { + struct script_symbol *parent; + struct script_typeinfo *typeinfo; + uintptr_t addr; + } variable; + }; + + struct script_list_node list_node; +}; + enum script_expr_type { SCR_EXPR_IDENT, SCR_EXPR_LITERAL_INT, @@ -161,12 +181,86 @@ struct script_stmt { struct script_list_node list_node; }; +/* ir + * =========================================================================== + */ + +enum script_ir_opcode { + SCR_IR_LOAD, + SCR_IR_STORE, + + SCR_IR_PUSH, + SCR_IR_POP, + SCR_IR_RET, + + SCR_IR_ADD, + SCR_IR_SUB, + SCR_IR_MUL, + SCR_IR_DIV, +}; + +enum script_ir_type { + SCR_IR_TYPE_U64, +}; + +struct script_ir_typeinfo { + enum script_ir_type type; + size_t size, alignment; +}; + +enum script_ir_operand_type { + SCR_IR_OPERAND_LITERAL, + SCR_IR_OPERAND_POINTER, + SCR_IR_OPERAND_IDENT, +}; + +struct script_ir_operand { + enum script_ir_operand_type type; + union { + uint64_t literal; + uintptr_t pointer; + struct script_flystr ident; + }; +}; + +#define SCRIPT_IR_INST_MAX_OPERANDS 1 + +struct script_ir_inst { + enum script_ir_opcode opcode; + struct script_ir_typeinfo *typeinfo; + struct script_ir_operand operands[SCRIPT_IR_INST_MAX_OPERANDS]; + size_t operand_count; +}; + /* libscript * =========================================================================== */ +#define SCRIPT_COMPILE_SCRATCH_BYTES 1024 + +#define SCRIPT_COMPILE_MAX_IDENTS 4096 + +#define SCRIPT_COMPILE_MAX_TOKS 8192 +#define SCRIPT_COMPILE_MAX_SYMS 2048 + +#define SCRIPT_COMPILE_MAX_STMTS 4096 +#define SCRIPT_COMPILE_MAX_EXPRS 4096 + +#define SCRIPT_COMPILE_MAX_EXPR_DEPTH 16 + +#define SCRIPT_COMPILE_MAX_IR_INSTRS 4096 + +struct script_program { + struct { + struct script_ir_inst *ptr; + size_t len; + } instructions; + + size_t 
max_heap_bytes; +}; + extern int -script_parse(char *src, size_t src_len, void *mem, size_t mem_len, - struct script_stmt **out, FILE *errstream, int verbose); +script_compile(char *src, size_t src_len, void *mem, size_t mem_len, + struct script_program *out, FILE *errstream, int verbose); #endif /* LIBSCRIPT_H */ diff --git a/libscript/libscript_internal.h b/libscript/libscript_internal.h @@ -10,6 +10,10 @@ #define UNREACHABLE() (*((volatile char *) 0) = 0) #define PANIC() UNREACHABLE() +/* utilities + * =========================================================================== + */ + #define ALIGN_PREV(v, align) ((v) & ~((align) - 1)) #define ALIGN_NEXT(v, align) ALIGN_PREV((v) + ((align) - 1), (align)) @@ -18,13 +22,13 @@ struct arena { size_t cap, len; }; -inline void +static inline void arena_reset(struct arena *arena) { arena->len = 0; } -inline void * +static inline void * arena_alloc(struct arena *arena, size_t size, size_t align) { size_t aligned_len = ALIGN_NEXT(arena->len, align); @@ -42,7 +46,7 @@ arena_alloc(struct arena *arena, size_t size, size_t align) #define ALLOC_SIZED(arena, T) ALLOC_ARRAY((arena), T, 1) -inline void +static inline void list_push_head(struct script_list *restrict list, struct script_list_node *restrict node) { @@ -56,7 +60,7 @@ list_push_head(struct script_list *restrict list, list->head = node; } -inline void +static inline void list_push_tail(struct script_list *restrict list, struct script_list_node *restrict node) { @@ -70,7 +74,7 @@ list_push_tail(struct script_list *restrict list, list->tail = node; } -inline struct script_list_node * +static inline struct script_list_node * list_pop_head(struct script_list *list) { if (!list->head) @@ -81,7 +85,7 @@ list_pop_head(struct script_list *list) return node; } -inline struct script_list_node * +static inline struct script_list_node * list_pop_tail(struct script_list *list) { if (!list->tail) @@ -92,4 +96,174 @@ list_pop_tail(struct script_list *list) return node; } +struct 
identifier { + char *ptr; + size_t len; +}; + +struct ident_pool { + struct identifier *ptr; + size_t cap, len; +}; + +static inline struct script_flystr +ident_pool_intern(struct ident_pool *pool, char *str, size_t len) +{ + /* TODO: switch to a hash table, or anything better than linear search */ + size_t i; + for (i = 0; i < pool->len; i++) { + struct identifier *ident = &pool->ptr[i]; + + if (ident->len == len && strncmp(ident->ptr, str, len) == 0) + return (struct script_flystr) { .v = i, }; + } + + assert(i < pool->cap); + + pool->ptr[i].ptr = str; + pool->ptr[i].len = len; + pool->len++; + + return (struct script_flystr) { .v = i, }; +} + +static inline void +ident_pool_get(struct ident_pool *pool, struct script_flystr ident, char **str, size_t *len) +{ + struct identifier *identifier = &pool->ptr[ident.v]; + + *str = identifier->ptr; + *len = identifier->len; +} + +struct token_stream { + struct script_token *ptr; + size_t cap, len, cur; +}; + +static inline struct script_token * +token_stream_alloc(struct token_stream *stream) +{ + if (stream->len >= stream->cap) + return NULL; + + return &stream->ptr[stream->len++]; +} + +struct symbol_table { + struct script_symbol *ptr; + size_t cap, len; + + uintptr_t address; +}; + +static inline struct script_symbol * +symbol_table_push(struct symbol_table *symtab) +{ + if (symtab->len >= symtab->cap) + return NULL; + + return &symtab->ptr[symtab->len++]; +} + +static inline struct script_symbol * +symbol_table_push_scope(struct symbol_table *symtab) +{ + return symbol_table_push(symtab); +} + +static inline void +symbol_table_pop_scope(struct symbol_table *symtab, struct script_symbol *scope) +{ + symtab->len = scope - symtab->ptr; +} + +static inline struct script_symbol * +symbol_table_find(struct symbol_table *symtab, struct script_flystr ident) +{ + for (size_t i = 1; i <= symtab->len; i++) { + struct script_symbol *sym = &symtab->ptr[symtab->len - i]; + if (sym->ident.v == ident.v) return sym; + } + + return 
NULL; +} + +static inline uintptr_t +symbol_table_next_addr(struct symbol_table *symtab, + struct script_typeinfo const *typeinfo) +{ + uintptr_t addr = ALIGN_NEXT(symtab->address, typeinfo->alignment); + symtab->address = addr + typeinfo->size; + return addr; +} + +struct ast { + struct script_list roots; +}; + +struct ir { + struct script_ir_inst *ptr; + size_t cap, len; +}; + +static inline int +ir_push(struct ir *ir, struct script_ir_inst *inst) +{ + if (ir->len >= ir->cap) + return -1; + + struct script_ir_inst *instruction = &ir->ptr[ir->len++]; + *instruction = *inst; + + return 0; +} + +struct compile_ctx { + struct arena arena; + struct arena scratch; + + FILE *errstream; + int verbose; + char *src; + size_t len; + + struct ident_pool ident_pool; + + struct token_stream stream; + + struct symbol_table symtab; + + struct ast ast; + + struct ir ir; +}; + +static inline void +dbglog(struct compile_ctx *ctx, char const *fmt, ...) +{ + va_list va; + va_start(va, fmt); + vfprintf(ctx->errstream, fmt, va); + va_end(va); +} + +static char const * +dump_token_type_str(enum script_token_type type); + +static int +dump_token(struct script_token *token, char *buf, size_t cap); + +static void +dump_token_stream(struct compile_ctx *ctx); + +static void +dump_symbol_table(struct compile_ctx *ctx); + +static void +dump_ast(struct compile_ctx *ctx); + +static void +dump_ir(struct compile_ctx *ctx); + #endif /* LIBSCRIPT_INTERNAL_H */ diff --git a/libscript/utils.c b/libscript/utils.c @@ -1,21 +0,0 @@ -#include "libscript_internal.h" - -extern inline void -arena_reset(struct arena *arena); - -extern inline void * -arena_alloc(struct arena *arena, size_t size, size_t align); - -extern inline void -list_push_head(struct script_list *restrict list, - struct script_list_node *restrict node); - -extern inline void -list_push_tail(struct script_list *restrict list, - struct script_list_node *restrict node); - -extern inline struct script_list_node * -list_pop_head(struct 
script_list *list); - -extern inline struct script_list_node * -list_pop_tail(struct script_list *list); diff --git a/scriptvm/scriptvm.c b/scriptvm/scriptvm.c @@ -1,14 +1,24 @@ +#define _XOPEN_SOURCE 500 + #include "libscript.h" +#include <inttypes.h> #include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + #include <getopt.h> -#define MiB (1024 * 1024) +#define KiB (1024ULL) +#define MiB (1024 * KiB) + +#define IS_ALIGNED(v, align) (((v) & ((align) - 1)) == 0) struct { int verbose; FILE *logfile; - uint64_t mem; + uint64_t compiler_mem, vm_stack_mem, vm_heap_mem; + int vm_emit_bytecode; struct { char **ptr; @@ -17,22 +27,28 @@ struct { } opts = { .verbose = 0, .logfile = NULL, - .mem = 8192 * 1024, + .compiler_mem = 8 * MiB, + .vm_stack_mem = 4 * KiB, + .vm_heap_mem = 4 * KiB, .sources.ptr = NULL, .sources.len = 0, }; -#define OPTSTR "hvf:m:" +#define OPTSTR "hvf:m:ES:M:" static void usage(char *prog) { - fprintf(stderr, "Usage: %s [-hv] [-m <mem-cap-mib>] sources...\n", prog); + fprintf(stderr, "Usage: %s [-hv] [-m <mem-cap-mib>] " + "[-E] [-S <stack-kib>] [-M <head-kib>] sources...\n", prog); fprintf(stderr, "\t-h : display usage information\n"); fprintf(stderr, "\t-v : enable verbose logging\n"); fprintf(stderr, "\t-f : file to log compilation errors to (default: stderr)\n"); fprintf(stderr, "\t-m : maximum memory for compilation, in MiB (default: 8 MiB)\n"); + fprintf(stderr, "\t-E : emit human-readable ir for interpreted instructions\n"); + fprintf(stderr, "\t-S : maximum stack memory for vm, in KiB (default: 4096 KiB)\n"); + fprintf(stderr, "\t-M : maximum heap memory for vm, in KiB (default: 4096 KiB)\n"); fprintf(stderr, "\tsources... 
: the source files to interpret\n"); } @@ -54,12 +70,30 @@ parse_opts(int argc, char **argv) break; case 'm': - if (!(opts.mem = strtoull(optarg, NULL, 0) * MiB)) { + if (!(opts.compiler_mem = strtoull(optarg, NULL, 0) * MiB)) { fprintf(stderr, "Failed to parse memory limit: %s\n", optarg); return -1; } break; + case 'E': + opts.vm_emit_bytecode = 1; + break; + + case 'S': + if (!(opts.vm_stack_mem = strtoull(optarg, NULL, 0) * KiB)) { + fprintf(stderr, "Failed to parse stack memory limit: %s\n", optarg); + return -1; + } + break; + + case 'M': + if (!(opts.vm_heap_mem = strtoull(optarg, NULL, 0) * KiB)) { + fprintf(stderr, "Failed to parse heap memory limit: %s\n", optarg); + return -1; + } + break; + default: return -1; } @@ -80,11 +114,199 @@ parse_opts(int argc, char **argv) } static void -interpret(struct script_stmt *ast) +emit(int fd, struct script_program const *prog) +{ + (void) fd; + (void) prog; + + // TODO: walk the ast and emit bytecode for it + + fprintf(stderr, "TODO!!!\n"); +} + +struct vm_state { + size_t pc, sp; + + uint64_t r0; + + struct { + unsigned char *ptr; + size_t len; + } stack, heap; +}; + +static void +dump_vm_state(struct vm_state *vm) +{ + fprintf(stderr, "vm state:\n"); + fprintf(stderr, "\tpc: 0x%" PRIx64 ", sp: 0x%" PRIu64 "\n", vm->pc, vm->sp); + fprintf(stderr, "\tr0: 0x%" PRIx64 "\n", vm->r0); +} + +static inline uint64_t +pop(struct vm_state *vm, struct script_ir_typeinfo *typeinfo) +{ + uint64_t value = 0; + + assert(typeinfo->size <= vm->sp); /* stack underrun */ + vm->sp -= typeinfo->size; + + unsigned char *storage = vm->stack.ptr + vm->sp; + assert(IS_ALIGNED((uintptr_t) storage, typeinfo->alignment)); + memcpy(&value, storage, typeinfo->size); + + return value; +} + +static inline void +push(struct vm_state *vm, struct script_ir_typeinfo *typeinfo, uint64_t value) { - (void) ast; + assert(vm->sp + typeinfo->size < vm->stack.len); - // TODO: interpret it + unsigned char *storage = vm->stack.ptr + vm->sp; + 
assert(IS_ALIGNED((uintptr_t) storage, typeinfo->alignment)); + memcpy(storage, &value, typeinfo->size); + + vm->sp += typeinfo->size; +} + +static inline void +load(struct vm_state *vm, struct script_ir_typeinfo *typeinfo, uintptr_t addr) +{ + uint64_t value = 0; + + assert(addr + typeinfo->size < vm->heap.len); + unsigned char *storage = vm->heap.ptr + addr; + assert(IS_ALIGNED((uintptr_t) storage, typeinfo->alignment)); + memcpy(&value, storage, typeinfo->size); + + push(vm, typeinfo, value); +} + +static inline void +store(struct vm_state *vm, struct script_ir_typeinfo *typeinfo, uintptr_t addr) +{ + uint64_t value = pop(vm, typeinfo); + + assert(addr + typeinfo->size < vm->heap.len); + unsigned char *storage = vm->heap.ptr + addr; + assert(IS_ALIGNED((uintptr_t) storage, typeinfo->alignment)); + memcpy(storage, &value, typeinfo->size); +} + +static inline uint64_t +vmadd(struct script_ir_typeinfo *typeinfo, uint64_t lhs, uint64_t rhs) +{ + switch (typeinfo->type) { + case SCR_IR_TYPE_U64: + return lhs + rhs; + } +} + +static inline uint64_t +vmsub(struct script_ir_typeinfo *typeinfo, uint64_t lhs, uint64_t rhs) +{ + switch (typeinfo->type) { + case SCR_IR_TYPE_U64: + return lhs - rhs; + } +} + +static inline uint64_t +vmmul(struct script_ir_typeinfo *typeinfo, uint64_t lhs, uint64_t rhs) +{ + switch (typeinfo->type) { + case SCR_IR_TYPE_U64: + return lhs * rhs; + } +} + +static inline uint64_t +vmdiv(struct script_ir_typeinfo *typeinfo, uint64_t lhs, uint64_t rhs) +{ + switch (typeinfo->type) { + case SCR_IR_TYPE_U64: + return lhs / rhs; + } +} + +static inline void +arithmetic(struct vm_state *vm, enum script_ir_opcode opcode, + struct script_ir_typeinfo *typeinfo) +{ + uint64_t lhs = pop(vm, typeinfo); + uint64_t rhs = pop(vm, typeinfo); + + uint64_t res; + switch (opcode) { + case SCR_IR_ADD: res = vmadd(typeinfo, lhs, rhs); break; + case SCR_IR_SUB: res = vmsub(typeinfo, lhs, rhs); break; + case SCR_IR_MUL: res = vmmul(typeinfo, lhs, rhs); break; + case 
SCR_IR_DIV: res = vmdiv(typeinfo, lhs, rhs); break; + default: assert(0); + } + + if (opts.verbose) + fprintf(stderr, "arith: %d, lhs: 0x%" PRIx64 ", rhs: 0x%" PRIx64 ", " + "res: 0x%" PRIx64 "\n", opcode, lhs, rhs, res); + + push(vm, typeinfo, res); +} + +static uint64_t +interpret(struct vm_state *vm, struct script_program *program) +{ + while (vm->pc < program->instructions.len) { + struct script_ir_inst *inst = &program->instructions.ptr[vm->pc]; + + if (opts.verbose) + fprintf(stderr, "inst: %d\n", inst->opcode); + + switch (inst->opcode) { + case SCR_IR_LOAD: + assert(inst->operand_count); + assert(inst->operands[0].type == SCR_IR_OPERAND_POINTER); + load(vm, inst->typeinfo, inst->operands[0].pointer); + break; + + case SCR_IR_STORE: + assert(inst->operand_count); + assert(inst->operands[0].type == SCR_IR_OPERAND_POINTER); + store(vm, inst->typeinfo, inst->operands[0].pointer); + break; + + case SCR_IR_PUSH: + assert(inst->operand_count); + assert(inst->operands[0].type == SCR_IR_OPERAND_LITERAL); + push(vm, inst->typeinfo, inst->operands[0].literal); + break; + + case SCR_IR_POP: + assert(inst->operand_count == 0); + (void) pop(vm, inst->typeinfo); + break; + + case SCR_IR_RET: + assert(inst->operand_count == 0); + vm->r0 = pop(vm, inst->typeinfo); + break; + + case SCR_IR_ADD: + case SCR_IR_SUB: + case SCR_IR_MUL: + case SCR_IR_DIV: + assert(inst->operand_count == 0); + arithmetic(vm, inst->opcode, inst->typeinfo); + break; + } + + vm->pc++; + + if (opts.verbose) + dump_vm_state(vm); + } + + return vm->r0; } int @@ -95,18 +317,18 @@ main(int argc, char **argv) exit(EXIT_FAILURE); } - void *mem = malloc(opts.mem); - assert(mem); + void *compiler_mem = malloc(opts.compiler_mem); + assert(compiler_mem); + memset(compiler_mem, 0, opts.compiler_mem); - memset(mem, 0, opts.mem); + unsigned char *vm_stack_mem = malloc(opts.vm_stack_mem); + assert(vm_stack_mem); - size_t mem_len = opts.mem; + unsigned char *vm_heap_mem = malloc(opts.vm_heap_mem); + 
assert(vm_heap_mem); - // TODO: mmap files in? read them into a single buffer? for (size_t i = 0; i < opts.sources.len; i++) { char *source = opts.sources.ptr[i]; - fprintf(stderr, "Interpreting source file: %s\n", source); - FILE *fp = fopen(source, "r"); if (!fp) { fprintf(stderr, "Failed to open source file: %s\n", source); @@ -125,17 +347,63 @@ main(int argc, char **argv) fclose(fp); - fprintf(stderr, "\tRead %zu bytes of file, parsing...\n", src_len); + fprintf(stderr, "[%s] Read %zu bytes of source file\n", source, src_len); - struct script_stmt *ast; - if (script_parse(src, src_len, mem, mem_len, &ast, - opts.logfile, opts.verbose) < 0) { - fprintf(stderr, "Failed to parse source file: %s\n", source); - continue; + struct script_program program; + if (script_compile(src, src_len, compiler_mem, opts.compiler_mem, + &program, opts.logfile, opts.verbose) < 0) { + fprintf(stderr, "Failed to compile source file: %s\n", source); + goto next_source; + } + + fprintf(stderr, "[%s] Compiled program: %zu instructions, uses %zu bytes of heap\n", + source, program.instructions.len, program.max_heap_bytes); + + if (opts.vm_heap_mem < program.max_heap_bytes) { + fprintf(stderr, "\tNot enough heap memory to run this program!\n"); + goto next_source; + } + + if (opts.vm_emit_bytecode) { + char tmpfile_path[128] = "/tmp/bytecode-XXXXXX"; + + int fd = mkstemp(tmpfile_path); + assert(fd > 0); + + fprintf(stderr, "[%s] Emitting bytecode to file: %s\n", + source, tmpfile_path); + + emit(fd, &program); + + close(fd); } - // emit(ast); // TODO: dump out bytecode to a file? 
- interpret(ast); + fprintf(stderr, "[%s] Interpreting with %zu bytes of stack, and %zu bytes of heap\n", + source, opts.vm_stack_mem, opts.vm_heap_mem); + + struct vm_state vm; + memset(&vm, 0, sizeof vm); + + memset(vm_stack_mem, 0, opts.vm_stack_mem); + vm.stack.ptr = vm_stack_mem; + vm.stack.len = opts.vm_stack_mem; + + memset(vm_heap_mem, 0, opts.vm_heap_mem); + vm.heap.ptr = vm_heap_mem; + vm.heap.len = opts.vm_heap_mem; + + if (opts.verbose) + dump_vm_state(&vm); + + uint64_t res = interpret(&vm, &program); + + fprintf(stderr, "[%s] Finished execution with value: 0x%" PRIx64 "\n", + source, res); + + dump_vm_state(&vm); + +next_source: + free(src); } exit(EXIT_SUCCESS);