script

script.git
git clone git://git.lenczewski.org/script.git
Log | Files | Refs

commit ea67d2dce57bf5c68942c320c78790c80d7acc36
Author: Mikołaj Lenczewski <mblenczewski@gmail.com>
Date:   Sun, 12 Jan 2025 20:26:49 +0000

Initial commit

Start parser for simplified script grammar, including just blocks,
variable declarations, and return statements. Start both scriptvm and
scriptcc drivers, for interpreting script via a bytecode vm and
compiling script into a native executable, respectively.

Have yet to finish the scriptvm stub to evaluate the parsed AST. Need to
track variable declaration typeinfos instead of kludging with the
default typeinfo. Need to handle literal typeinfos better.

Diffstat:
A .editorconfig | 17 +++++++++++++++++
A .gitignore | 6 ++++++
A README.txt | 2 ++
A build.sh | 26 ++++++++++++++++++++++++++
A clean.sh | 5 +++++
A debug.sh | 16 ++++++++++++++++
A docs/script.grammar | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A docs/simple.grammar | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A examples/test.script | 2 ++
A libscript/libscript.c | 649 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libscript/libscript.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libscript/libscript_internal.h | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A libscript/utils.c | 21 +++++++++++++++++++++
A scriptcc/scriptcc.c | 7 +++++++
A scriptvm/scriptvm.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
15 files changed, 1364 insertions(+), 0 deletions(-)

diff --git a/.editorconfig b/.editorconfig @@ -0,0 +1,17 @@ +root = true + +[*] +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +charset = utf-8 + +guidelines = 80, 120, 160 + +[*.{c,h}] +indent_style = tab +indent_size = 8 + +[*.{grammar,md,txt}] +indent_style = space +indent_size = 2 diff --git a/.gitignore b/.gitignore @@ -0,0 +1,6 @@ +bin/ +obj/ +out/ + +**/.*.swp +imgui.ini diff --git a/README.txt b/README.txt @@ -0,0 +1,2 @@ +script +============================================================================== diff --git a/build.sh b/build.sh @@ -0,0 +1,26 @@ +#!/bin/sh + +CC="${CC:-clang}" +AR="${AR:-llvm-ar}" +RANLIB="${RANLIB:-llvm-ranlib}" + +WARNINGS="-Wall -Wextra -Wpedantic ${WERROR:+-Werror}" + +CFLAGS="-std=c11 -O0 -g" +CPPFLAGS="-UNDEBUG" +LDFLAGS="" + +set -ex + +mkdir -p bin obj + +$CC -o obj/libscript.o -c libscript/libscript.c \ + $WARNINGS $CFLAGS $CPPFLAGS $LDFLAGS +$AR rcs bin/libscript.a obj/libscript.o +$RANLIB bin/libscript.a + +$CC -o bin/scriptvm scriptvm/scriptvm.c bin/libscript.a \ + $WARNINGS $CFLAGS $CPPFLAGS -I libscript/ $LDFLAGS + +#$CC -o bin/scriptcc scriptcc/scriptcc.c bin/libscript.a \ +# $WARNINGS $CFLAGS $CPPFLAGS -I libscript/ $LDFLAGS diff --git a/clean.sh b/clean.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +set -ex + +rm -rf bin/ obj/ out/ diff --git a/debug.sh b/debug.sh @@ -0,0 +1,16 @@ +#!/bin/sh + +case "$1" in + vm) + lldbgui -- bin/scriptvm -v examples/test.script + ;; + cc) + mkdir -p out + lldbgui -- bin/scriptcc -v -o out/test examples/test.scipt + ;; + + *) + echo "Unknown mode: ${1:-<none>}, must be one of: vm cc" + exit 1 + ;; +esac diff --git a/docs/script.grammar b/docs/script.grammar @@ -0,0 +1,140 @@ +program = + | symbol program + | $eof + ; + +identifier = + | $(sequence of alphanumeric chars or '_' not starting with a number) + ; + +literal = + | int-literal + | chr-literal + | str-literal + ; + +int-literal = + | $(optionally prefixed series of numbers) + ; + +chr-literal = + | 
'\'' $(optionally escaped ascii codepoint) '\'' + ; + +str-literal = + | '"' $(sequence of optionally escaped ascii codepoints) '"' + ; + +symbol = + | type-definition + | func-definition + | var-definition + ; + +type-definition = + | identifier-list '::' typename + ; + +identifier-list = + | identifier + | identifier ',' identifier-list + ; + +typename = + | 'utf8' + | 'chr8' + | 'u64' + | 's64' + | identifier + | pointer-type + | array-type + | struct-type + ; + +pointer-type = + | '*' typename + ; + +array-type = + | '[' int-literal ']' typename + ; + +struct-type = + | 'struct' '{' [ struct-member-list ] '}' + ; + +struct-member-list = + | struct-member + | struct-member struct-member-list + ; + +struct-member = + | identifier-list ':' typename ';' + ; + +var-definition = + | identifier-list ':' typename [ '=' initialiser ] ';' + ; + +initialiser = + | literal + | expression + | '.' identifier '=' initialiser + | '{' [ initialiser-list ] '}' + ; + +expression = + | literal + | identifier + ; + +initialiser-list = + | initialiser + | initialiser ',' initialiser-list + ; + +fn-definition = + | identifier '::' '(' [ fn-parameter-list ] ')' ':' fn-rettype fn-body + ; + +fn-parameter-list = + | fn-parameter + | fn-parameter ',' fn-parameter-list + ; + +fn-parameter = + | identifier ':' typename + ; + +fn-rettype = + | 'void' + | typename + ; + +fn-body = + | '{' [ statement-list ] '}' + ; + +statement-list = + | statement + | statement statement-list + ; + +statement = + | var-definition + | block-statement + | return-statement + | expr-statement + ; + +block-statement = + | '{' [ statement-list ] '}' + ; + +return-statement = + | 'return' [ expression ] ';' + ; + +expr-statement = + | expression ';' + ; diff --git a/docs/simple.grammar b/docs/simple.grammar @@ -0,0 +1,64 @@ +program = + | [ statement-list ] $eof + ; + +statement-list = + | statement + | statement statement-list + ; + +statement = + | null-statement + | decl-statement + | ret-statement + | 
block-statement + ; + +null-statement = + | ';' + ; + +decl-statement = + | ident ':' type '=' expr ';' + ; + +ret-statement = + | 'return' ident ';' + ; + +block-statement = + | '{' [ statement-list ] '}' + ; + +ident = + | ? c-style identifier ? + ; + +type = + | 'u64' + ; + +expr = + | ident + | literal + | binary-op + ; + +binary-op = + | expr expr binary-operator + ; + +binary-operator = + | '+' + | '-' + | '*' + | '/' + ; + +literal = + | integer-literal + ; + +integer-literal = + | ? c-style integer literal ? + ; diff --git a/examples/test.script b/examples/test.script @@ -0,0 +1,2 @@ +x : u64 = 0; +return 42 1 + x *; diff --git a/libscript/libscript.c b/libscript/libscript.c @@ -0,0 +1,649 @@ +#include "libscript_internal.h" + +struct token_stream { + struct script_token *ptr; + size_t len, cur; +}; + +struct compile_ctx { + FILE *errstream; + int verbose; + + struct arena *arena; + + struct token_stream *stream; + + char scratch[64]; +}; + +static inline void +dbglog(struct compile_ctx *ctx, char const *fmt, ...) +{ + va_list va; + va_start(va, fmt); + vfprintf(ctx->errstream, fmt, va); + va_end(va); +} + +static int +try_tokenise_keyword(char *src, char *end, struct script_token *out) +{ + size_t len = end - src; + + if (len == strlen("return") && strncmp(src, "return", len) == 0) { + out->type = SCR_TOKEN_RETURN; + return 0; + } else if (len == strlen("u64") && strncmp(src, "u64", len) == 0) { + out->type = SCR_TOKEN_U64; + return 0; + } + + return -1; +} + +static struct token_stream +tokenise(struct compile_ctx *ctx, char *src, size_t src_len) +{ + if (ctx->verbose) + dbglog(ctx, "info: tokenise() for %zu bytes of source\n", src_len); + + char *end = src + src_len; + + struct script_token *token; + char *buf = src, *buf_end = src; + + while (src < end) { + char lookahead[] = { + src[0], + (src + 1 < end) ? src[1] : '\0', + (src + 2 < end) ? 
src[2] : '\0', + }; + + if (isspace(lookahead[0])) + goto next_char; + + switch (lookahead[0]) { + /* these single char sequences map directly to a unique token */ + case SCR_TOKEN_LPAREN: case SCR_TOKEN_RPAREN: + case SCR_TOKEN_LBRACK: case SCR_TOKEN_RBRACK: + case SCR_TOKEN_LBRACE: case SCR_TOKEN_RBRACE: + case SCR_TOKEN_LANGLE: case SCR_TOKEN_RANGLE: + case SCR_TOKEN_LSLASH: case SCR_TOKEN_RSLASH: + case SCR_TOKEN_COLON: case SCR_TOKEN_SEMICOLON: + case SCR_TOKEN_DOT: case SCR_TOKEN_COMMA: + case SCR_TOKEN_PLUS: case SCR_TOKEN_MINUS: + case SCR_TOKEN_STAR: case SCR_TOKEN_EQUALS: + token = ALLOC_SIZED(ctx->arena, struct script_token); + assert(token); + + token->type = (enum script_token_type) lookahead[0]; + goto next_char; + + /* TODO: multi-char sequences map to unique tokens */ + /* TODO: a string literal */ + /* TODO: a character literal */ + + /* an integer literal or float literal (TODO) */ + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + buf = buf_end = src; + + uint64_t value = strtoull(buf, &buf_end, 0); + if (ctx->verbose) + dbglog(ctx, "info: have integer literal: '%.*s'\n", + (int) (buf_end - buf), buf); + + if (errno == EINVAL) { + dbglog(ctx, "error: integer literal is invalid: '%.*s'\n", + (int) (buf_end - buf), buf); + goto error; + } else if (errno == ERANGE) { + dbglog(ctx, "warn: integer literal is out of range, truncating: " + "'%.*s'\n", (int) (buf_end - buf), buf); + } + + token = ALLOC_SIZED(ctx->arena, struct script_token); + assert(token); + + token->type = SCR_TOKEN_LITERAL_INT; + token->literal_int = value; + + src = buf_end; + break; + + /* anything else is a fragment of a ident or keyword */ + default: + if (!isalnum(lookahead[0]) && lookahead[0] != '_') { + dbglog(ctx, "error: unexpected character: %c\n", + lookahead[0]); + goto error; + } + + buf = buf_end = src; + while (isalnum(*buf_end) || *buf_end == '_') + buf_end++; + + if (ctx->verbose) + dbglog(ctx, "info: have ident 
or keyword: '%.*s'\n", + (int) (buf_end - buf), buf); + + token = ALLOC_SIZED(ctx->arena, struct script_token); + assert(token); + + if (try_tokenise_keyword(buf, buf_end, token) < 0) { + token->type = SCR_TOKEN_IDENT; + // TODO: get flystr + } + + src = buf_end; + + break; + } + + continue; + +next_char: + src++; + } + + return (struct token_stream) { + .ptr = ctx->arena->ptr, + .len = ctx->arena->len / sizeof *token, + .cur = 0, + }; + +error: + return (struct token_stream) {0}; +} + +static inline char const * +token_type_str(enum script_token_type type) +{ + static char const *type_to_str[] = { + [SCR_TOKEN_EOF] = "EOF", + [SCR_TOKEN_IDENT] = "IDENT", + [SCR_TOKEN_LITERAL_INT] = "LITERAL_INT", + [SCR_TOKEN_U64] = "U64", + [SCR_TOKEN_RETURN] = "RETURN", + [SCR_TOKEN_LPAREN] = "(", + [SCR_TOKEN_RPAREN] = ")", + [SCR_TOKEN_LBRACK] = "[", + [SCR_TOKEN_RBRACK] = "]", + [SCR_TOKEN_LBRACE] = "{", + [SCR_TOKEN_RBRACE] = "}", + [SCR_TOKEN_LANGLE] = "<", + [SCR_TOKEN_RANGLE] = ">", + [SCR_TOKEN_LSLASH] = "\\", + [SCR_TOKEN_RSLASH] = "/", + [SCR_TOKEN_COLON] = ":", + [SCR_TOKEN_SEMICOLON] = ";", + [SCR_TOKEN_DOT] = ".", + [SCR_TOKEN_COMMA] = ",", + [SCR_TOKEN_EQUALS] = "=", + [SCR_TOKEN_PLUS] = "+", + [SCR_TOKEN_MINUS] = "-", + [SCR_TOKEN_STAR] = "*", + }; + + return type_to_str[type]; +} + +static inline int +dump_token(struct script_token *token, char *buf, size_t cap) +{ + switch (token->type) { + case SCR_TOKEN_IDENT: + return snprintf(buf, cap, "Token {type: %s, ident: %" PRIu64 "}", + token_type_str(token->type), token->ident.v); + + case SCR_TOKEN_LITERAL_INT: + return snprintf(buf, cap, "Token {type: %s, literal_int: %" PRIu64 "}", + token_type_str(token->type), token->literal_int); + + default: + return snprintf(buf, cap, "Token {type: %s}", token_type_str(token->type)); + } +} + +static void +dump_token_stream(struct compile_ctx *ctx, struct script_token *toks, size_t len) +{ + struct script_token *end = toks + len; + + dbglog(ctx, "token stream: %zu tokens\n", 
len); + + char buf[64]; + while (toks < end) { + int written = dump_token(toks, buf, sizeof buf); + assert(written); + + dbglog(ctx, "\t%.*s\n", written, buf); + + toks++; + } + + dbglog(ctx, "\n"); +} + +static inline struct script_token +peek(struct compile_ctx *ctx, size_t off) +{ + if (ctx->stream->cur + off >= ctx->stream->len) + return (struct script_token) { .type = SCR_TOKEN_EOF, }; + + if (ctx->verbose) + dbglog(ctx, "peek(%zu/%zu), %d\n", ctx->stream->cur + off, + ctx->stream->len, ctx->stream->ptr[ctx->stream->cur + off].type); + + return ctx->stream->ptr[ctx->stream->cur + off]; +} + +static inline struct script_token +next(struct compile_ctx *ctx) +{ + if (ctx->stream->cur > ctx->stream->len) + return (struct script_token) { .type = SCR_TOKEN_EOF, }; + + if (ctx->verbose) + dbglog(ctx, "next(%zu/%zu), %d\n", ctx->stream->cur, ctx->stream->len, + ctx->stream->ptr[ctx->stream->cur].type); + + return ctx->stream->ptr[ctx->stream->cur++]; +} + +static inline struct script_token +expect(struct compile_ctx *ctx, enum script_token_type expected) +{ + struct script_token tok = next(ctx); + + if (ctx->verbose) + dbglog(ctx, "expect(%zu/%zu, T: %d), %d\n", + ctx->stream->cur, ctx->stream->len, expected, tok.type); + + if (tok.type != expected) { + int len = dump_token(&tok, ctx->scratch, sizeof ctx->scratch); + dbglog(ctx, "error: expected %s, got: %.*s\n", + token_type_str(expected), len, ctx->scratch); + PANIC(); + } + + return tok; +} + +static struct script_typeinfo * +primitive_typeinfo(enum script_type type) +{ + static struct script_typeinfo typeinfos[] = { + [SCR_TOKEN_U64] = { .type = SCR_TYPE_U64, .size = 8, .alignment = 8, }, + }; + + return &typeinfos[type]; +} + +static struct script_typeinfo * +literal_typeinfo(struct compile_ctx *ctx, enum script_token_type type) +{ + switch (type) { + case SCR_TOKEN_LITERAL_INT: // TODO: better rules surrounding literal types + return primitive_typeinfo(SCR_TYPE_U64); + + default: { + dbglog(ctx, "error: invalid 
token type has no type info: %s\n", + token_type_str(type)); + return NULL; + } break; + } +} + +static struct script_typeinfo * +parse_typeinfo(struct compile_ctx *ctx) +{ + struct script_token tok = next(ctx); + + switch (tok.type) { + case SCR_TOKEN_U64: + return primitive_typeinfo(SCR_TYPE_U64); + + default: { + int len = dump_token(&tok, ctx->scratch, sizeof ctx->scratch); + dbglog(ctx, "error: expected typeinfo, got: %.*s\n", + len, ctx->scratch); + return NULL; + } break; + } + +} + +static struct script_expr * +parse_expr(struct compile_ctx *ctx) +{ + struct script_expr *stack[128]; + size_t i = 0; + + while (i < sizeof stack) { + struct script_token tok = peek(ctx, 0); + + struct script_expr *expr; + switch (tok.type) { + case SCR_TOKEN_IDENT: + expr = ALLOC_SIZED(ctx->arena, struct script_expr); + assert(expr); + + tok = next(ctx); + + expr->type = SCR_EXPR_IDENT; + expr->ident = tok.ident; + + // TODO: fetch the previously registered typeinfo? + expr->typeinfo = primitive_typeinfo(SCR_TYPE_U64); + + break; + + case SCR_TOKEN_LITERAL_INT: + expr = ALLOC_SIZED(ctx->arena, struct script_expr); + assert(expr); + + tok = next(ctx); + + expr->type = SCR_EXPR_LITERAL_INT; + expr->literal_int = tok.literal_int; + expr->typeinfo = literal_typeinfo(ctx, tok.type); + + break; + + case SCR_TOKEN_PLUS: + case SCR_TOKEN_MINUS: + case SCR_TOKEN_STAR: + case SCR_TOKEN_RSLASH: + expr = ALLOC_SIZED(ctx->arena, struct script_expr); + assert(expr); + + expr->type = SCR_EXPR_BINARY_OP; + switch (next(ctx).type) { + case SCR_TOKEN_PLUS: expr->binary_op.type = SCR_BINARY_OP_ADD; break; + case SCR_TOKEN_MINUS: expr->binary_op.type = SCR_BINARY_OP_SUB; break; + case SCR_TOKEN_STAR: expr->binary_op.type = SCR_BINARY_OP_MUL; break; + case SCR_TOKEN_RSLASH: expr->binary_op.type = SCR_BINARY_OP_DIV; break; + default: UNREACHABLE(); break; + } + + assert(i >= 2); + expr->binary_op.rhs = stack[--i]; + expr->binary_op.lhs = stack[--i]; + + assert(expr->binary_op.lhs->typeinfo == 
expr->binary_op.rhs->typeinfo); + expr->typeinfo = expr->binary_op.lhs->typeinfo; + + break; + + default: + goto end; + } + + assert(i < sizeof stack); + stack[i++] = expr; + } + + if (i == sizeof stack) { + struct script_token tok = peek(ctx, 0); + int len = dump_token(&tok, ctx->scratch, sizeof ctx->scratch); + dbglog(ctx, "error: stack overflow while parsing expression: '%.*s'\n", + len, ctx->scratch); + return NULL; + } + +end: + return stack[0]; +} + +static struct script_stmt * +parse_return(struct compile_ctx *ctx) +{ + struct script_stmt *stmt = ALLOC_SIZED(ctx->arena, struct script_stmt); + assert(stmt); + + stmt->type = SCR_STMT_RET; + expect(ctx, SCR_TOKEN_RETURN); + stmt->ret.expr = parse_expr(ctx); + + expect(ctx, SCR_TOKEN_SEMICOLON); + + return stmt; +} + +static struct script_stmt * +parse_vardecl(struct compile_ctx *ctx) +{ + struct script_stmt *stmt = ALLOC_SIZED(ctx->arena, struct script_stmt); + assert(stmt); + + stmt->type = SCR_STMT_VARDECL; + stmt->vardecl.ident = expect(ctx, SCR_TOKEN_IDENT).ident; + expect(ctx, SCR_TOKEN_COLON); + stmt->vardecl.typeinfo = parse_typeinfo(ctx); + expect(ctx, SCR_TOKEN_EQUALS); + stmt->vardecl.expr = parse_expr(ctx); + + assert(stmt->vardecl.typeinfo == stmt->vardecl.expr->typeinfo); + + expect(ctx, SCR_TOKEN_SEMICOLON); + + return stmt; +} + +static struct script_stmt * +parse_statement(struct compile_ctx *ctx) +{ + struct script_token tok = peek(ctx, 0); + switch (tok.type) { + case SCR_TOKEN_RETURN: + return parse_return(ctx); + + case SCR_TOKEN_IDENT: + return parse_vardecl(ctx); + + default: { + int len = dump_token(&tok, ctx->scratch, sizeof ctx->scratch); + dbglog(ctx, "error: expected a statement, got: '%.*s'\n", + len, ctx->scratch); + return NULL; + } break; + } +} + +static struct script_stmt * +parse_statement_list(struct compile_ctx *ctx) +{ + struct script_stmt *stmt = ALLOC_SIZED(ctx->arena, struct script_stmt); + assert(stmt); + + stmt->type = SCR_STMT_BLOCK; + stmt->block.children.head = 
stmt->block.children.tail = NULL; + + while (peek(ctx, 0).type != SCR_TOKEN_EOF) { + struct script_stmt *child = parse_statement(ctx); + list_push_tail(&stmt->block.children, &child->list_node); + } + + return stmt; +} + +static struct script_stmt * +parse(struct compile_ctx *ctx) +{ + struct script_stmt *ast = parse_statement_list(ctx); + assert(ast); + + assert(ctx->stream->cur == ctx->stream->len); + + return ast; +} + +static void +dump_typeinfo(struct compile_ctx *ctx, struct script_typeinfo *typeinfo, size_t indent) +{ +#define leader(indent) \ + for (size_t i = 0; i < indent; i++) dbglog(ctx, " "); + + switch (typeinfo->type) { + case SCR_TYPE_U64: { + static char const *type_str[] = { + [SCR_TYPE_U64] = "U64", + }; + + leader(indent) + dbglog(ctx, "Typeinfo { type: %s, size: %zu, alignment: %zu }\n", + type_str[typeinfo->type], typeinfo->size, typeinfo->alignment); + } break; + } + +#undef leader +} + +static void +dump_expr(struct compile_ctx *ctx, struct script_expr *expr, size_t indent) +{ +#define leader(indent) \ + for (size_t i = 0; i < indent; i++) dbglog(ctx, " "); + + switch (expr->type) { + case SCR_EXPR_IDENT: + leader(indent) + dbglog(ctx, "ident: %" PRIu64 "\n", expr->ident.v); + + leader(indent + 1) + dbglog(ctx, "typeinfo:\n"); + dump_typeinfo(ctx, expr->typeinfo, indent + 2); + break; + + case SCR_EXPR_LITERAL_INT: + leader(indent) + dbglog(ctx, "literal int: %" PRIu64 "\n", expr->literal_int); + + leader(indent + 1) + dbglog(ctx, "typeinfo:\n"); + dump_typeinfo(ctx, expr->typeinfo, indent + 2); + break; + + case SCR_EXPR_BINARY_OP: { + static char const *binary_op_str[] = { + [SCR_BINARY_OP_ADD] = "+", + [SCR_BINARY_OP_SUB] = "-", + [SCR_BINARY_OP_MUL] = "*", + [SCR_BINARY_OP_DIV] = "/", + }; + + leader(indent) + dbglog(ctx, "binary op: %s\n", binary_op_str[expr->binary_op.type]); + + leader(indent + 1) + dbglog(ctx, "typeinfo:\n"); + dump_typeinfo(ctx, expr->typeinfo, indent + 2); + + leader(indent + 1) + dbglog(ctx, "lhs:\n"); + 
dump_expr(ctx, expr->binary_op.lhs, indent + 2); + + leader(indent + 1) + dbglog(ctx, "rhs:\n"); + dump_expr(ctx, expr->binary_op.rhs, indent + 2); + } break; + } + +#undef leader +} + +static void +dump_parse_tree(struct compile_ctx *ctx, struct script_stmt *node, size_t indent) +{ +#define leader(indent) \ + for (size_t i = 0; i < indent; i++) dbglog(ctx, " "); + + struct script_list *list; + switch (node->type) { + case SCR_STMT_BLOCK: + leader(indent) + dbglog(ctx, "block\n"); + + list = &node->block.children; + SCRIPT_LIST_ITER(list) { + struct script_stmt *child = + SCRIPT_FROM_NODE(it, struct script_stmt, list_node); + + dump_parse_tree(ctx, child, indent + 1); + } + break; + + case SCR_STMT_VARDECL: + leader(indent) + dbglog(ctx, "vardecl: ident: %" PRIu64 "\n", node->vardecl.ident.v); + + leader(indent + 1) + dbglog(ctx, "typeinfo:\n"); + dump_typeinfo(ctx, node->vardecl.typeinfo, indent + 2); + + leader(indent + 1) + dbglog(ctx, "expr:\n"); + dump_expr(ctx, node->vardecl.expr, indent + 2); + break; + + case SCR_STMT_RET: + leader(indent) + dbglog(ctx, "return\n"); + + leader(indent + 1) + dbglog(ctx, "expr:\n"); + dump_expr(ctx, node->ret.expr, indent + 2); + break; + } + +#undef leader +} + +int +script_parse(char *src, size_t src_len, void *mem, size_t mem_len, + struct script_stmt **out, FILE *errstream, int verbose) +{ + struct compile_ctx ctx = { + .errstream = errstream, + .verbose = verbose, + }; + + struct arena arena = { + .ptr = mem, + .cap = mem_len, + .len = 0, + }; + + if (ctx.verbose) + dbglog(&ctx, "info: arena cap: %zu bytes, verbose: %d\n", + arena.cap, verbose); + + ctx.arena = &arena; + + struct token_stream stream = tokenise(&ctx, src, src_len); + if (!stream.ptr) { + dbglog(&ctx, "error: failed to tokenise source\n"); + return -1; + } + + if (ctx.verbose) + dump_token_stream(&ctx, stream.ptr, stream.len); + + ctx.stream = &stream; + + struct script_stmt *stmt = parse(&ctx); + if (!stmt) { + dbglog(&ctx, "error: failed to parse 
source\n"); + return -1; + } + + if (verbose) + dump_parse_tree(&ctx, stmt, 0); + + *out = stmt; + + return 0; +} + +#include "utils.c" diff --git a/libscript/libscript.h b/libscript/libscript.h @@ -0,0 +1,172 @@ +#ifndef LIBSCRIPT_H +#define LIBSCRIPT_H + +#include <assert.h> +#include <errno.h> +#include <stdalign.h> +#include <stdarg.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* utilities + * =========================================================================== + */ + +#define SCRIPT_TO_PARENT(child_ptr, T, member) \ + ((child_ptr) ? (T *) ((uintptr_t) child_ptr - offsetof(T, member)) : NULL) + +struct script_flystr { + uint64_t v; +}; + +struct script_list_node { + struct script_list_node *prev, *next; +}; + +#define SCRIPT_FROM_NODE(node, T, member) SCRIPT_TO_PARENT(node, T, member) + +#define SCRIPT_NODE_ITER(node) \ + for (struct script_list_node *it = (node); it; it = it->next) + +#define SCRIPT_NODE_RITER(node) \ + for (struct script_list_node *it = (node); it; it = it->prev) + +struct script_list { + struct script_list_node *head, *tail; +}; + +#define SCRIPT_LIST_ITER(list) SCRIPT_NODE_ITER((list)->head) +#define SCRIPT_LIST_RITER(list) SCRIPT_NODE_RITER((list)->tail) + +/* lexer + * =========================================================================== + */ + +enum script_token_type { + SCR_TOKEN_EOF, + + /* literals */ + SCR_TOKEN_IDENT, + SCR_TOKEN_LITERAL_INT, + + /* keywords */ + SCR_TOKEN_U64, + SCR_TOKEN_RETURN, + + /* punctuation */ + SCR_TOKEN_LPAREN = '(', + SCR_TOKEN_RPAREN = ')', + SCR_TOKEN_LBRACK = '[', + SCR_TOKEN_RBRACK = ']', + SCR_TOKEN_LBRACE = '{', + SCR_TOKEN_RBRACE = '}', + + SCR_TOKEN_LANGLE = '<', + SCR_TOKEN_RANGLE = '>', + SCR_TOKEN_LSLASH = '\\', + SCR_TOKEN_RSLASH = '/', + + SCR_TOKEN_COLON = ':', + SCR_TOKEN_SEMICOLON = ';', + SCR_TOKEN_DOT = '.', + SCR_TOKEN_COMMA = ',', + + SCR_TOKEN_EQUALS = '=', + SCR_TOKEN_PLUS = '+', + SCR_TOKEN_MINUS = '-', + 
SCR_TOKEN_STAR = '*', +}; + +struct script_token { + enum script_token_type type; + + union { + struct script_flystr ident; + uint64_t literal_int; + }; +}; + +/* parser + * =========================================================================== + */ + +enum script_type { + SCR_TYPE_U64, +}; + +struct script_typeinfo { + enum script_type type; + + size_t size; + size_t alignment; +}; + +enum script_expr_type { + SCR_EXPR_IDENT, + SCR_EXPR_LITERAL_INT, + SCR_EXPR_BINARY_OP, +}; + +struct script_expr { + enum script_expr_type type; + + struct script_typeinfo *typeinfo; + + union { + struct script_flystr ident; + + uint64_t literal_int; + + struct { + enum { + SCR_BINARY_OP_ADD, + SCR_BINARY_OP_SUB, + SCR_BINARY_OP_MUL, + SCR_BINARY_OP_DIV, + } type; + + struct script_expr *lhs, *rhs; + } binary_op; + }; +}; + +enum script_stmt_type { + SCR_STMT_BLOCK, + SCR_STMT_VARDECL, + SCR_STMT_RET, +}; + +struct script_stmt { + enum script_stmt_type type; + + union { + struct { + struct script_list children; + } block; + + struct { + struct script_flystr ident; + struct script_typeinfo *typeinfo; + struct script_expr *expr; + } vardecl; + + struct { + struct script_expr *expr; + } ret; + }; + + struct script_list_node list_node; +}; + +/* libscript + * =========================================================================== + */ + +extern int +script_parse(char *src, size_t src_len, void *mem, size_t mem_len, + struct script_stmt **out, FILE *errstream, int verbose); + +#endif /* LIBSCRIPT_H */ diff --git a/libscript/libscript_internal.h b/libscript/libscript_internal.h @@ -0,0 +1,95 @@ +#ifndef LIBSCRIPT_INTERNAL_H +#define LIBSCRIPT_INTERNAL_H + +#include "libscript.h" + +#include <ctype.h> +#include <inttypes.h> +#include <limits.h> + +#define UNREACHABLE() (*((volatile char *) 0) = 0) +#define PANIC() UNREACHABLE() + +#define ALIGN_PREV(v, align) ((v) & ~((align) - 1)) +#define ALIGN_NEXT(v, align) ALIGN_PREV((v) + ((align) - 1), (align)) + +struct arena { + void 
*ptr; + size_t cap, len; +}; + +inline void +arena_reset(struct arena *arena) +{ + arena->len = 0; +} + +inline void * +arena_alloc(struct arena *arena, size_t size, size_t align) +{ + size_t aligned_len = ALIGN_NEXT(arena->len, align); + if (aligned_len + size > arena->cap) + return NULL; + + void *ptr = (void *) ((uintptr_t) arena->ptr + aligned_len); + arena->len = aligned_len + size; + + return ptr; +} + +#define ALLOC_ARRAY(arena, T, n) \ + arena_alloc((arena), sizeof(T) * (n), alignof(T)) + +#define ALLOC_SIZED(arena, T) ALLOC_ARRAY((arena), T, 1) + +inline void +list_push_head(struct script_list *restrict list, + struct script_list_node *restrict node) +{ + if (!list->tail) + list->tail = node; + + if (list->head) + list->head->prev = node; + + node->next = list->head; + list->head = node; +} + +inline void +list_push_tail(struct script_list *restrict list, + struct script_list_node *restrict node) +{ + if (!list->head) + list->head = node; + + if (list->tail) + list->tail->next = node; + + node->prev = list->tail; + list->tail = node; +} + +inline struct script_list_node * +list_pop_head(struct script_list *list) +{ + if (!list->head) + return NULL; + + struct script_list_node *node = list->head; + list->head = node->next; + return node; +} + +inline struct script_list_node * +list_pop_tail(struct script_list *list) +{ + if (!list->tail) + return NULL; + + struct script_list_node *node = list->tail; + list->tail = node->prev; + return node; +} + +#endif /* LIBSCRIPT_INTERNAL_H */ diff --git a/libscript/utils.c b/libscript/utils.c @@ -0,0 +1,21 @@ +#include "libscript_internal.h" + +extern inline void +arena_reset(struct arena *arena); + +extern inline void * +arena_alloc(struct arena *arena, size_t size, size_t align); + +extern inline void +list_push_head(struct script_list *restrict list, + struct script_list_node *restrict node); + +extern inline void +list_push_tail(struct script_list *restrict list, + struct script_list_node *restrict node); + +extern 
inline struct script_list_node * +list_pop_head(struct script_list *list); + +extern inline struct script_list_node * +list_pop_tail(struct script_list *list); diff --git a/scriptcc/scriptcc.c b/scriptcc/scriptcc.c @@ -0,0 +1,7 @@ +#include "libscript.h" + +int +main(int argc, char **argv) +{ + return 0; +} diff --git a/scriptvm/scriptvm.c b/scriptvm/scriptvm.c @@ -0,0 +1,142 @@ +#include "libscript.h" + +#include <stdio.h> +#include <getopt.h> + +#define MiB (1024 * 1024) + +struct { + int verbose; + FILE *logfile; + uint64_t mem; + + struct { + char **ptr; + size_t len; + } sources; +} opts = { + .verbose = 0, + .logfile = NULL, + .mem = 8192 * 1024, + + .sources.ptr = NULL, + .sources.len = 0, +}; + +#define OPTSTR "hvf:m:" + +static void +usage(char *prog) +{ + fprintf(stderr, "Usage: %s [-hv] [-m <mem-cap-mib>] sources...\n", prog); + fprintf(stderr, "\t-h : display usage information\n"); + fprintf(stderr, "\t-v : enable verbose logging\n"); + fprintf(stderr, "\t-f : file to log compilation errors to (default: stderr)\n"); + fprintf(stderr, "\t-m : maximum memory for compilation, in MiB (default: 8 MiB)\n"); + fprintf(stderr, "\tsources... 
: the source files to interpret\n"); +} + +static int +parse_opts(int argc, char **argv) +{ + int opt; + while ((opt = getopt(argc, argv, OPTSTR)) > 0) { + switch (opt) { + case 'v': + opts.verbose = 1; + break; + + case 'f': + if (!(opts.logfile = fopen(optarg, "w+"))) { + fprintf(stderr, "Failed to open logfile: %s\n", optarg); + return -1; + } + break; + + case 'm': + if (!(opts.mem = strtoull(optarg, NULL, 0) * MiB)) { + fprintf(stderr, "Failed to parse memory limit: %s\n", optarg); + return -1; + } + break; + + default: + return -1; + } + } + + if (!opts.logfile) + opts.logfile = stderr; + + opts.sources.ptr = argv + optind; + opts.sources.len = argc - optind; + + if (!opts.sources.len) { + fprintf(stderr, "Failed to provide source files\n"); + return -1; + } + + return 0; +} + +static void +interpret(struct script_stmt *ast) +{ + (void) ast; + + // TODO: interpret it +} + +int +main(int argc, char **argv) +{ + if (parse_opts(argc, argv)) { + usage(argv[0]); + exit(EXIT_FAILURE); + } + + void *mem = malloc(opts.mem); + assert(mem); + + memset(mem, 0, opts.mem); + + size_t mem_len = opts.mem; + + // TODO: mmap files in? read them into a single buffer? + for (size_t i = 0; i < opts.sources.len; i++) { + char *source = opts.sources.ptr[i]; + fprintf(stderr, "Interpreting source file: %s\n", source); + + FILE *fp = fopen(source, "r"); + if (!fp) { + fprintf(stderr, "Failed to open source file: %s\n", source); + continue; + } + + fseek(fp, 0, SEEK_END); + size_t src_len = ftell(fp); + rewind(fp); + + char *src = malloc(src_len); + assert(src); + + size_t nbytes = fread(src, 1, src_len, fp); + assert(nbytes == src_len); + + fclose(fp); + + fprintf(stderr, "\tRead %zu bytes of file, parsing...\n", src_len); + + struct script_stmt *ast; + if (script_parse(src, src_len, mem, mem_len, &ast, + opts.logfile, opts.verbose) < 0) { + fprintf(stderr, "Failed to parse source file: %s\n", source); + continue; + } + + // emit(ast); // TODO: dump out bytecode to a file? 
+ interpret(ast); + } + + exit(EXIT_SUCCESS); +}