From c573be476bc67892191aee937226ffcd590aa1db Mon Sep 17 00:00:00 2001 From: Peter McGoron Date: Wed, 19 Jun 2024 23:27:14 -0400 Subject: [PATCH] tokenizer --- Makefile | 6 ++ README.rst | 8 ++ main.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++++++++ prelude.scm | 203 ++++++++++++++++++++++++++++++++++++ 4 files changed, 505 insertions(+) create mode 100644 Makefile create mode 100644 README.rst create mode 100644 main.c create mode 100644 prelude.scm diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cc596b2 --- /dev/null +++ b/Makefile @@ -0,0 +1,6 @@ +UNIVERSAL_SERVICE_DIR=universalservice +CFLAGS=-I${UNIVERSAL_SERVICE_DIR}/include -g -I${UNIVERSAL_SERVICE_DIR}/examples/string -std=c89 -Wall -pedantic -Werror +LDFLAGS=-L${UNIVERSAL_SERVICE_DIR} -luniversalservice -L${UNIVERSAL_SERVICE_DIR}/examples/string -lunsstring + +flatrate: main.c + $(CC) main.c -o flatrate $(CFLAGS) $(LDFLAGS) diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..e531dc1 --- /dev/null +++ b/README.rst @@ -0,0 +1,8 @@ +============= +Flatrate LISP +============= + +Flatrate is a minimalistic LISP. It's designed to test out my garbage +collectors. + +The default macro system is non-hygenic. diff --git a/main.c b/main.c new file mode 100644 index 0000000..7ec3acb --- /dev/null +++ b/main.c @@ -0,0 +1,288 @@ +/* Copyright (c) 2024, Peter McGoron + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1) Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include "uns.h" +#include "uns_string.h" +#include "cheney_c89.h" + +static struct uns_gc gc; + +enum token_type { + T_EOF, + LPAREN, + RPAREN, + QUOTE, + QUASIQUOTE, + UNQUOTE, + UNQUOTE_LIST, + IDENT, + NUMBER, + STRING, + TOKEN_NUM +}; + +static const char *token2string[TOKEN_NUM] = { + "EOF", + "(", + ")", + "'", + "`", + ",", + ",@", + "IDENT", + "NUMBER", + "STRING" +}; + +struct token { + enum token_type typ; + struct uns_ctr dat; + long i; +}; + +static int is_ws(int c) +{ + return c == '\n' || c == '\t' || c == '\r' || c == '\v' || c == ' '; +} + +static int get_skipws(FILE *input) +{ + int c; + + for (;;) { + c = getc(input); + if (c == ';') { + do { c = getc(input); } while (c != '\n'); + ungetc(c, input); + continue; + } + + if (is_ws(c)) + continue; + return c; + } +} + +static void tok_string(FILE *input, struct token *tok) +{ + int c; + + tok->typ = STRING; + uns_string_alloc(&gc, &tok->dat, 32); + for (;;) { + c = getc(input); + switch (c) { + case '\\': + c = getc(input); + switch (c) { + case ' ': case '\t': case '\r': case '\n': case '\v': + do { + c = getc(input); + } while (is_ws(c)); + ungetc(c, input); + continue; + case 'r': + uns_string_append_char(&gc, &tok->dat, '\r'); + continue; + case 'n': + uns_string_append_char(&gc, &tok->dat, '\n'); + continue; + case 'v': + uns_string_append_char(&gc, &tok->dat, '\v'); + continue; + case 't': + uns_string_append_char(&gc, &tok->dat, '\t'); + continue; + case '"': + uns_string_append_char(&gc, &tok->dat, '"'); + continue; + default: + uns_string_append_char(&gc, &tok->dat, '\\'); + uns_string_append_char(&gc, &tok->dat, c); + } + case '"': /* " */ + return; + } + } +} + +static int tonum(int c) +{ + switch (c) { + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + return c - '0'; + default: + return -1; + } +} + +static void tok_num(FILE *input, struct token *tok, int mul) +{ + int c = getc(input); + + tok->typ = NUMBER; + uns_string_alloc(&gc, &tok->dat, 32); + do { + uns_string_append_char(&gc, &tok->dat, c); + c = getc(input); + } while (tonum(c) >= 0); + ungetc(c, input); + + tok->i = mul*strtol(uns_string_cstring(&gc, &tok->dat), NULL, 10); + tok->dat.p = NULL; +} + +static int part_of_ident(int c) +{ + return !is_ws(c) && c != '(' && c != ')' && c != '`' + && c != '\'' && c != '\\' && c != '\"' && c != ',' + && c != ';'; +} + +static void tok_ident(FILE *input, struct token *tok, int c) +{ + tok->typ = IDENT; + + uns_string_alloc(&gc, &tok->dat, 32); + do { + uns_string_append_char(&gc, &tok->dat, c); + c = getc(input); + } while (part_of_ident(c)); + ungetc(c, input); +} + +static void tokenize(FILE *input, struct token *tok) +{ + int c = get_skipws(input); + int c2; + + tok->dat.p = NULL; + + switch (c) { + case EOF: + tok->typ = T_EOF; + return; + case '(': + tok->typ = LPAREN; + return; + case ')': + tok->typ = RPAREN; + return; + case '\'': + tok->typ = QUOTE; + return; + case '`': + tok->typ = QUASIQUOTE; + return; + case ',': + c = getc(input); + if (c == '@') { + tok->typ = UNQUOTE_LIST; + } else { + ungetc(c, input); + tok->typ = UNQUOTE; + } + return; + case '"': + tok_string(input, tok); + return; + case '+': case '-': + c2 = getc(input); + ungetc(c2, input); + if (tonum(c2) >= 0) { + tok_num(input, tok, c == '+' ? 1 : -1); + return; + } + default: + if (tonum(c) >= 0) + tok_num(input, tok, 1); + else + tok_ident(input, tok, c); + return; + } +} + +static void oom(struct uns_gc *gc_) +{ + (void)gc_; + printf("oom\n"); + exit(1); +} + +static void after_gc(struct uns_gc *gc_) +{ + (void)gc_; + + if (gc.after_collection >= gc.next_alloc * 7/10) + gc.next_alloc *= 2; +} + +int main(void) +{ + FILE *input = stdin; + struct token tok; + int indent = 0; + int i; + + gc.next_alloc = 512; + gc.oom = oom; + gc.after_gc = after_gc; + + gc.ctx = malloc(uns_cheney_c89_ctx_size); + if(!gc.ctx || !uns_cheney_c89_init(&gc)) + return 1; + uns_root_add(&gc, &tok.dat); + + do { + tokenize(input, &tok); + if (tok.typ == LPAREN) + indent++; + else if (tok.typ == RPAREN) + indent--; + + for (i = 0; i < indent; i++) + printf(" "); + + printf("%s ", token2string[tok.typ]); + switch (tok.typ) { + case STRING: case IDENT: + printf("[%s] ", uns_string_cstring(&gc, &tok.dat)); + break; + case NUMBER: + printf("[%ld] ", tok.i); + break; + default: + break; + } + printf("\n"); + } while (tok.typ != T_EOF); + + uns_cheney_c89_deinit(&gc); + return 0; +} diff --git a/prelude.scm b/prelude.scm new file mode 100644 index 0000000..097ca64 --- /dev/null +++ b/prelude.scm @@ -0,0 +1,203 @@ +; Copyright (c) 2024, Peter McGoron +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; 1) Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; 2) Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +; Primitives are special syntatic forms. They differ from built-in +; functions. +; +; primitives: +; __unilambda: lambda of one argument +; __define: define in the environment +; __define-macro: Like __define but for macros +; set! +; if +; +; Others: +; __uniapply: Apply function to single argument +; __null: A value that can only be passed to __unilambda. It cannot be +; bound to a top-level, or "set!", or added to a list. +; +; Macros are functions that return an sexpr. + +(__define-macro let + (__unilambda l ; car = args, cdr = body + (if (null? (car args)) + (cdr args) + ; (car args) = ((A a) (B b) ...) + ; (car (car args)) = (A a) + ; (cdr (car (car args))) = (a) + `((__unilambda ,(car (car (car args))) + (let ,(cdr (car args)) ,@body)) + ,(car (cdr (car (car args)))) + ) + ) + ) +) + +(__define-macro let* let) + +(__define-macro __bindlambda + (__unilambda l + (let ((larg (car l)) + (args (car (cdr l))) + (body (cdr (cdr l))) + ) + (if (null? args) + `(__unilambda larg + (if (not (null? larg)) + (raise "incorrect number of arguments") + ,body + ) + ) + (let* ((argval (cons args)) + (rest (cdr args)) + (arg (cons argval)) + (val (cons (cdr argval))) + ) + `(__unilambda ,larg + (let ((,arg ,val)) __bindlambda ,larg ,rest ,@body) + ) + ) + ) + ) + ) +) + +(__define-macro lambda + (__unilambda l + (let ((args (car l)) + (body (cdr l)) + ) + (if (symbol? args) + `(__unilambda ,args ,@body) + (let ((larg (gensym))) + (if (null? args) + `(__unilambda ,larg ,@body) + `(__bindlambda ,larg . body) + ) + ) + ) + ) + ) +) + +(__define-macro define-macro + (__unilambda l + (let* ((name-and-args (car l)) + (name (car name-and-args)) + (args (cdr name-and-args)) + (body (cdr l)) + (tmpname (gensym)) + ) + `(__define-macro ,name + (lambda ,args ,body) + ) + ) + ) +) + +(define-macro (define name . body) + (if (symbol? name) + `(__define ,name ,@body) + `(__define ,(car name) (lambda ,(cdr name) ,@body)) + ) +) + +(define-macro (begin . body) + `((lambda () ,@body)) +) + +(define-macro (and . body) + (if (null? body) + 1 + (let ((first (car body)) + (rest (cdr body)) + ) + `(if ,first (and ,@rest) 0) + ) + ) +) + +(define-macro (or . body) + (if (null? body) + 0 + (let ((first (car body)) + (rest (cdr body)) + ) + `(if ,first 1 (or ,@rest)) + ) + ) +) + +(define-macro (letrec args . body) + (if (null? args) + body + (let* ((argval (car args)) + (arg (car argval)) + (val (car (cdr argval))) + (rest (cdr args)) + ) + `(let ((,arg __null)) + (letrec ,rest (set! ,arg ,val) ,@body) + ) + ) + ) +) + +;; for a list of (v1 v2 ... vn) +;; runs (f vn (... (f v2 (f v1 start)))) +(define (foldl f start l) + (letrec + ((loop + (lambda (ret-list cur-list) + (if (eqv? value '()) + value + (loop (f (car cur-list) value) + (cdr cur-list)) + ) + ) + )) + (loop start l) + ) +) + +(define (reverse l) (foldl cons '() l)) + +;; for a list of (v1 v2 ... vn) +;; runs (f v1 (f v2 (... (f vn start)))) +(define (foldr f start l) (foldl f start (reverse l))) + +(define (append . l) + (foldr (lambda (to-prepend collected) + (foldr cons collected to-prepend) + ) + '() + l + ) +) + +(define (apply f . l) (__uniapply f (__uniapply append l))) +(define (list . l) l) + +;; (define (+ . l) (foldl __bin+ 0 l)) +;; (define (* . l) (foldl __bin* 1 l))