;;; Copyright (C) Peter McGoron 2024
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License as published by
;;; the Free Software Foundation, version 3 of the License.
;;;
;;; This program is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program. If not, see <https://www.gnu.org/licenses/>.

;;; R7RS reader. This is the lexer-parser end, so it returns tokens and
;;; not concrete objects.
;;;
;;; Notes:
;;;
;;; Port stores datum labels. Datum labels are stored for the entirety of
;;; a READ: this is to emulate MIT Scheme, which allows for datum labels
;;; outside of the datum that the label appears in.
;;;
;;; The reader does not return Scheme data: it returns annotated data
;;; containing the source location, datum label number, resolved datum
;;; label pointer. This is for advanced syntax systems.
;;;
;;; How datum labels could work:
;;;
;;; When encountering #[number]=, allocate a datum label and assign it
;;; nothing. Then call READ after "=", and destructively update the
;;; datum label with the resulting datum. A pass over the new read
;;; structure to convert it to regular Scheme data will resolve the
;;; indirection.
;;;
;;; All tokens are procedure-encapsulated objects, since the reader should
;;; never return a literal procedure. Each procedure has a TYPE message.

(load "chez-compat.scm")
(load "util.scm")
(load "set.scm")
(load "linked-list.scm")

;;; My text editor cannot parse Scheme's character syntax.
(define %bol #\()
(define %eol #\))

;;; ;;;;;;;;;;;;;;;;;;;;;;;;
;;; Port reader wrapper
;;; ;;;;;;;;;;;;;;;;;;;;;;;;

;;; Wrap a Scheme input port as a zero-argument read function suitable
;;; for PORT->READ. The function returns whatever READ-CHAR returns.
(define port->read-function
  (lambda (port)
    (lambda ()
      (read-char port))))

;;; READ:
;;;
;;; Stream readers contain mutable state. This is the case-folding mode
;;; and the current list of datum labels.
;;;
;;; (PORT->READ READ-FUNCTION FILENAME) returns a procedure PORT that is
;;; called as (PORT OP ARGS ...). READ-FUNCTION is a thunk that returns
;;; the next character, or #F / an EOF object at end of input.
;;;
;;; (PORT 'LOCATION): Return (LIST FILENAME LINE-NUMBER OFFSET).
;;; (PORT 'READ): Read the next character in the stream. Returns #F on EOF.
;;; (PORT 'PUSH CHAR): Push CHAR such that it will be the next character
;;;   read when (PORT 'READ) is called.
;;; (PORT 'PEEK): Read character, push it back, and return it.
;;; (PORT 'FOLD-CASE?): Returns a boolean if case folding is enabled.
;;; (PORT 'FOLD-CASE! BOOL): Sets the case folding to BOOL.
;;; (PORT 'SET-DATUM-LABEL! LABEL VALUE): Bind LABEL to VALUE.
;;; (PORT 'GET-DATUM-LABEL LABEL): Search for LABEL; returns whatever
;;;   SMAP:SEARCH returns.
;;; (PORT 'CLEAR-DATUM-LABELS!): Forget all datum labels.
;;; (PORT 'DUMP-MUTABLE): Return (LIST DATUM-LABELS FOLD-CASE?).
;;; (PORT 'RESTORE-MUTABLE! STATE): Restore state from DUMP-MUTABLE.
(define port->read
  (lambda (read-function filename)
    (let ((line-number 1)
          (offset 0)
          (pushback-buffer '())
          (datum-labels '())
          (fold-case? #f))
      (letrec ((update-position!
                (lambda (ch)
                  (cond
                   ((eqv? ch #\newline)
                    (set! line-number (+ 1 line-number))
                    (set! offset 0))
                   ;; OFFSET is sometimes set to #F to denote an unknown
                   ;; offset.
                   (offset (set! offset (+ 1 offset))))))
               (location
                (lambda ()
                  (list filename line-number offset)))
               (set-datum-label!
                (lambda (label value)
                  (set! datum-labels
                        (car (smap:insert datum-labels label value)))))
               (get-datum-label
                (lambda (label)
                  (smap:search datum-labels label)))
               (clear-datum-labels!
                (lambda ()
                  (set! datum-labels '())))
               (dump-mutable
                (lambda ()
                  (list datum-labels fold-case?)))
               (restore-mutable!
                (lambda (state)
                  (set! datum-labels (car state))
                  (set! fold-case? (cadr state))))
               (process
                (lambda (ch)
                  (update-position! ch)
                  (cond
                   ;; Normalize every end-of-input marker to #F: the
                   ;; character maps can compare #F but not EOF objects.
                   ((or (eof-object? ch) (not ch)) #f)
                   (fold-case? (char-downcase ch))
                   (else ch))))
               (port
                (lambda (op . args)
                  ;; TODO: turn into string map?
                  (cond
                   ((eq? op 'location) (location))
                   ((eq? op 'read)
                    (process
                     (if (null? pushback-buffer)
                         (read-function)
                         (let ((ch (car pushback-buffer)))
                           (set! pushback-buffer (cdr pushback-buffer))
                           ch))))
                   ((eq? op 'peek)
                    (let ((ch (port 'read)))
                      (port 'push ch)
                      ch))
                   ((eq? op 'push)
                    (let ((ch (car args)))
                      (cond
                       ;; The length of the previous line is not
                       ;; remembered, so the offset becomes unknown.
                       ((eqv? ch #\newline)
                        (set! line-number (- line-number 1))
                        (set! offset #f))
                       ;; Only step back when the offset is known;
                       ;; (- #F 1) would raise an error.
                       (offset (set! offset (- offset 1))))
                      (set! pushback-buffer (cons ch pushback-buffer))))
                   ((eq? op 'fold-case?) fold-case?)
                   ((eq? op 'fold-case!) (set! fold-case? (car args)))
                   ((eq? op 'set-datum-label!)
                    (apply set-datum-label! args))
                   ((eq? op 'get-datum-label)
                    (apply get-datum-label args))
                   ((eq? op 'clear-datum-labels!)
                    (apply clear-datum-labels! args))
                   ((eq? op 'dump-mutable)
                    (apply dump-mutable args))
                   ((eq? op 'restore-mutable!)
                    (apply restore-mutable! args))
                   (else
                    (error 'port->read 'invalid (cons op args)))))))
        port))))

;;; ;;;;;;;;;;;;;;
;;; Character maps
;;; ;;;;;;;;;;;;;;

;;; Three-way comparison on integers: returns one of the symbols <, =, >.
(define integer<=>
  (lambda (x y)
    (cond
     ((< x y) '<)
     ((= x y) '=)
     (else '>))))

;;; Comparison on characters extended to #F, which is less than all
;;; characters.
(define char*<=>
  (lambda (x y)
    (cond
     ((and (not x) y) '<)
     ((and x (not y)) '>)
     ((and (not x) (not y)) '=)
     (else (integer<=> (char->integer x)
                       (char->integer y))))))

(define %charmap:<=> (set:<=>-to-map char*<=>))
(define %charmap:update (set:update %charmap:<=>))
(define charmap:update (map:update %charmap:update))
(define charmap:insert (map:insert %charmap:update))
(define charmap:search (map:search %charmap:<=>))

;;; ;;;;;;;;;;;;;;;;;;;;;;
;;; Readtable constructors
;;;
;;; Readtables are composed of a CHARMAP, which is a map from characters
;;; to actions, and a DEFAULT-ACTION, which is taken when there is no
;;; match in CHARMAP.
;;;
;;; An "action" is a procedure that takes four arguments:
;;;
;;; TABLE: The current table.
;;; CHAR: The character that was matched against the CHARMAP in TABLE.
;;; ACC: An arbitrary "accumulator" value that is different depending
;;; on the readtable in question.
;;; PORT: A port reader object.
;;; ;;;;;;;;;;;;;;;;;;;;;;

;;; (READTABLE:NEW DEFAULT-ACTION CHARMAP)
;;;
;;; A readtable is the pair (DEFAULT-ACTION . CHARMAP).
(define readtable:new cons)
(define %readtable:default-action car)
(define %readtable:charmap cdr)

;;; Run the action in TABLE assigned to CHAR, or the default action of
;;; TABLE if there is no entry for CHAR.
(define readtable:act
  (lambda (table char acc port)
    (let ((node (charmap:search (%readtable:charmap table) char)))
      (let ((action (if (null? node)
                        (%readtable:default-action table)
                        (map:val node))))
        (action table char acc port)))))

;;; Run the action in TABLE with the next character from PORT.
(define readtable:next
  (lambda (table acc port)
    (readtable:act table (port 'read) acc port)))

;;; Return a new readtable where CHAR is bound to ACTION.
(define readtable:update
  (lambda (table char action)
    (readtable:new (%readtable:default-action table)
                   (car (charmap:insert (%readtable:charmap table)
                                        char
                                        action)))))

;;; Update TABLE to act on all characters in LST with ACTION.
(define readtable:update-list
  (lambda (table lst action)
    (fold (lambda (char table)
            (readtable:update table char action))
          table
          lst)))

;;; Construct new readtable with no characters in its map and
;;; DEFAULT-ACTION as the default action.
(define readtable:empty/default
  (lambda (default-action)
    (readtable:new default-action '())))

;;; Each value in FUNCTIONS is a list (PROCEDURE ARGS...) which is called
;;; like (PROCEDURE TABLE ARGS...) and returns a table.
(define readtable:process
  (lambda (table . functions)
    (fold (lambda (function table)
            (apply (car function) table (cdr function)))
          table
          functions)))

;;; ;;;;;;;;;;;;;;;;;;
;;; Default actions
;;; ;;;;;;;;;;;;;;;;;;

;;; Return an action that signals an error.
;;;
;;; EMSG is the leading argument list for ERROR (in this file: a WHO
;;; symbol followed by a message string). It is spliced in front of
;;; CHAR, ACC, TABLE and PORT, which become the trailing arguments.
;;; Passing EMSG itself as the first argument would hand ERROR a list
;;; where it expects the WHO/message.
(define readtable:error
  (lambda emsg
    (lambda (table char acc port)
      (apply error (append emsg (list char acc table port))))))

;;; Discard the current character and continue reading the readtable.
(define readtable:skip
  (lambda (table char acc port)
    (readtable:act table (port 'read) acc port)))

;;; Discard char and return constant.
(define readtable:return
  (lambda (return)
    (lambda (table char acc port)
      return)))

;;; Jump to a new readtable, discard its return, and continue reading
;;; in the table.
(define readtable:jump-discard
  (lambda (newtable)
    (lambda (oldtable char acc port)
      (readtable:act newtable (port 'read) '() port)
      (readtable:act oldtable (port 'read) acc port))))

;;; Jump to a new readtable with the same characters.
(define readtable:jump
  (lambda (newtable)
    (lambda (oldtable char acc port)
      (readtable:act newtable char acc port))))

;;; Jump to a new readtable, reading the new character, with the old
;;; readtable as ACC.
(define readtable:next/old-as-acc
  (lambda (newtable)
    (lambda (oldtable __ _ port)
      (readtable:next newtable oldtable port))))

;;; Jump to a new readtable, reading the new character.
(define readtable:jump/next
  (lambda (newtable)
    (lambda (oldtable _ acc port)
      (readtable:next newtable acc port))))

;;; ;;;;;;;;;;;;;;;;;
;;; Identifier reader
;;; ;;;;;;;;;;;;;;;;;

;;; Construct an identifier token named NAME that was read at LOCATION.
;;;
;;; Messages:
;;; (TOKEN 'TYPE): Returns the symbol IDENT.
;;; (TOKEN 'VALUE): Returns NAME.
;;; (TOKEN 'LOCATION): Returns LOCATION as passed to the constructor.
;;; Any other message signals an error.
(define read:ident
  (lambda (name location)
    (lambda (op . args)
      (cond
       ((eq? op 'type) 'ident)
       ((eq? op 'value) name)
       ((eq? op 'location) location)
       (else (error 'read:ident "invalid operation" op args))))))

;;; Construct an accumulator of characters for an identifier read at
;;; LOCATION. If START-CHAR is supplied, it is pushed first.
;;;
;;; (BUILDER 'FINALIZE->IDENT) returns a READ:IDENT token made from the
;;; accumulated characters. Every other message is forwarded to the
;;; underlying linked list.
(define read:ident-builder
  (lambda (location . start-char)
    (let ((char-list (linked-list:new)))
      (if (not (null? start-char))
          (char-list 'push (car start-char)))
      (lambda (op . args)
        (cond
         ((eq? op 'finalize->ident)
          (read:ident (list->string (char-list 'to-list))
                      location))
         (else (apply char-list op args)))))))

;;; Push back CHAR and return ACC finalized as an identifier.
(define readtable:return-acc-as-ident
  (lambda (table char acc port)
    (port 'push char)
    (acc 'finalize->ident)))

;;; Push CHAR to ACC and continue reading from TABLE.
(define readtable:push-char
  (lambda (table char acc port)
    (acc 'push-tail char)
    (readtable:act table (port 'read) acc port)))

;;; Define a readtable that constructs an identifier by accepting all
;;; characters that are not listed.
(define readtable:exclude-from-identifiers
  (lambda (table excluded)
    (fold (lambda (char table)
            (readtable:update table char readtable:return-acc-as-ident))
          table
          excluded)))

;;; ASCII whitespace.
(define readtable:ASCII-whitespace
  (list #\newline
        #\space
        (integer->char #x09)
        (integer->char #x0B)
        (integer->char #x0C)
        (integer->char #x0D)))

;;; Readtable for identifiers. Whitespace, the delimiters listed below
;;; and end of input (#F) terminate the identifier; every other
;;; character is accumulated.
(define readtable:identifier
  (readtable:process
   (readtable:empty/default readtable:push-char)
   (list readtable:exclude-from-identifiers readtable:ASCII-whitespace)
   (list readtable:exclude-from-identifiers
         (list #\| %bol %eol #\' #\; #f))))

;;; Read an identifier starting with CHAR.
;;;
;;; TABLE and ACC are ignored. The builder (and with it the recorded
;;; location) is created before the next character is read, because the
;;; evaluation order of procedure arguments is unspecified.
(define readtable:read-ident
  (lambda (table char acc port)
    (let* ((builder (read:ident-builder (port 'location) char))
           (next-char (port 'read)))
      (readtable:act readtable:identifier next-char builder port))))

;;; ;;;;;;;;;;;;;;;;;;;;
;;; Comments and whitespace reader
;;; ;;;;;;;;;;;;;;;;;;;;

;;; Readtable for a line comment. It skips everything up to and
;;; including the next newline and returns #F.
;;;
;;; End of input (#F) also ends the comment: without that entry the
;;; default SKIP action would re-read #F forever when the last line of
;;; the input is a comment with no trailing newline.
(define readtable:read-to-newline
  (readtable:process
   (readtable:empty/default readtable:skip)
   (list readtable:update #\newline (readtable:return #f))
   (list readtable:update #f (readtable:return #f))))

;;; ;;;;;;;;;;;
;;; List reader
;;;
;;; The reader updates the previous readtable to handle ). This means
;;; that this read table does not have to handle end-of-line, whitespace,
;;; etc.
;;; ;;;;;;;;;;;

;;; Read the end of an improper list.
;;;
;;; TABLE is the readtable of the enclosing list and ACC is the list
;;; accumulator. One datum is read (a closing paren at that point is an
;;; error) and installed with (ACC 'SET-CDR! ...). After that only
;;; whitespace followed by the closing paren is accepted. Returns the
;;; symbol END-OF-LIST.
(define readtable:read-improper-cdr
  (lambda (table acc port)
    (let ((val (readtable:act
                (readtable:update
                 table
                 %eol
                 (readtable:error 'read-improper-cdr
                                  "proper list must have cdr"))
                (port 'read)
                #f
                port)))
      (acc 'set-cdr! val)
      (let ((table (readtable:process
                    (readtable:empty/default
                     (readtable:error 'read-improper-cdr
                                      "improper list has 1 cdr"))
                    (list readtable:update-list
                          readtable:ASCII-whitespace
                          readtable:skip)
                    (list readtable:update
                          %eol
                          (lambda dummy 'end-of-list)))))
        (readtable:act table (port 'read) acc port)))))

;;; Generic reader loop for a list. It takes as input the table that has
;;; already been updated with end of list and improper list handlers.
(define readtable:read-list-loop
  (lambda (table port)
    (let ((acc (linked-list:new)))
      (letrec ((loop
                (lambda ()
                  (let ((value (readtable:act table
                                              (port 'read)
                                              acc
                                              port)))
                    (cond
                     ;; The end-of-list marker finishes the list;
                     ;; anything else is an element.
                     ((eqv? value 'end-of-list) (acc 'to-list))
                     (else (acc 'push-tail value)
                           (loop)))))))
        (loop)))))

;;; Readtable for a list, generic to proper and improper list
;;; readers.
;;;
;;; OLDTABLE is extended so that:
;;; - the closing paren returns the symbol END-OF-LIST;
;;; - end of input (#F) is an error; if it kept OLDTABLE's handler, the
;;;   list loop could accumulate that handler's return value forever;
;;; - "." reads a whole identifier. If that identifier is exactly ".",
;;;   (ON-DOT TABLE ACC PORT) is called; otherwise the identifier (for
;;;   example "...") is returned as an ordinary element.
(define readtable:table-for-list
  (lambda (oldtable on-dot)
    (readtable:process
     oldtable
     (list readtable:update %eol (readtable:return 'end-of-list))
     (list readtable:update
           #f
           (readtable:error 'read-list "unexpected end of input in list"))
     (list readtable:update
           #\.
           (lambda (table char acc port)
             (let ((entire-identifier
                    (readtable:read-ident table char #f port)))
               ;; READ-IDENT returns a token procedure, so compare its
               ;; name, not the token itself, against ".".
               (if (equal? (entire-identifier 'value) ".")
                   (on-dot table acc port)
                   entire-identifier)))))))

;;; Read a proper or improper list.
(define readtable:read-list
  (lambda (oldtable _ __ port)
    (readtable:read-list-loop
     (readtable:table-for-list oldtable readtable:read-improper-cdr)
     port)))

;;; Read strictly a proper list. This assumes that BOL has been read.
(define readtable:read-proper-list
  (lambda (table port)
    (readtable:read-list-loop
     (readtable:table-for-list
      table
      ;; ON-DOT takes (TABLE ACC PORT) while actions made by
      ;; READTABLE:ERROR take (TABLE CHAR ACC PORT), so adapt the
      ;; arguments.
      (lambda (table acc port)
        ((readtable:error 'read-proper-list "expected proper list")
         table #\. acc port)))
     port)))

;;; Read a proper list, including its opening paren. Signals an error
;;; if the next character is not BOL.
(define readtable:expect-proper-list
  (lambda (table port)
    (if (not (eqv? (port 'read) %bol))
        (error 'expect-proper-list 'no-list-found)
        (readtable:read-proper-list table port))))

;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Reader for stuff that start with "#"
;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define readtable:digits
  (list #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))

;;; Action for "#(": read a proper list with the TOPLEVEL table (passed
;;; as ACC) and return (LIST 'VECTOR ELEMENTS).
(define readtable:vector
  (lambda (_ __ toplevel port)
    (list 'vector (readtable:read-proper-list toplevel port))))

;;; Block comment reader.
;;;
;;; The outermost block comment reader is passed the toplevel reader as
;;; ACC. When the outermost block is finished, it will tail-call ACC.
;;; (It is basically the continuation of the reader.)
;;;
;;; When a nested block comment is found, it is passed #F as ACC, which
;;; it will not call. It will return an unspecified value.
;;;
;;; Since the read tables are not procedures, references to other tables
;;; in the same LETREC declaration must be protected with explicit LAMBDAs.
;;; Macros could make this much easier to read.
;;;
;;; Three tables cooperate:
;;; LOOP skips characters until it sees "#" or "|".
;;; POTENTIAL-START is entered after "#": a following "|" opens a nested
;;;   comment, anything else is handed back to LOOP.
;;; POTENTIAL-END is entered after "|": a following "#" closes the
;;;   comment, anything else is handed back to LOOP.
;;;
;;; End of input (#F) inside a comment is an error. Without that entry
;;; the default SKIP action of LOOP would re-read #F forever.
(define readtable:block-comment
  (letrec ((potential-end
            (readtable:process
             (readtable:empty/default
              (lambda (this char acc port)
                (readtable:act loop char acc port)))
             (list readtable:update
                   #\#
                   (lambda (this char acc port)
                     ;; Only the outermost comment has a continuation.
                     (if acc
                         (readtable:next acc #f port))))))
           (potential-start
            (readtable:process
             (readtable:empty/default
              (lambda (this char acc port)
                (readtable:act loop char acc port)))
             (list readtable:update
                   #\|
                   (lambda (this char acc port)
                     ;; Consume the nested comment, then resume this one.
                     (readtable:next loop #f port)
                     (readtable:next loop acc port)))))
           (loop
            (readtable:process
             (readtable:empty/default readtable:skip)
             (list readtable:update
                   #f
                   (readtable:error 'block-comment
                                    "unterminated block comment"))
             (list readtable:update
                   #\#
                   (lambda (this char acc port)
                     (readtable:next potential-start acc port)))
             (list readtable:update
                   #\|
                   (lambda (this char acc port)
                     (readtable:next potential-end acc port))))))
    loop))

;;; Construct a datum label object read at LOCATION. TOPLEVEL is the
;;; readtable used to read the labelled datum.
;;;
;;; Messages:
;;; (LABEL 'FINALIZE-VALUE VALUE): Store VALUE and drop the reference to
;;;   TOPLEVEL. It is an error to finalize twice.
;;; (LABEL 'FINALIZE->IDENT): Freeze the accumulated label digits into
;;;   an identifier token.
;;; (LABEL 'VALUE): Return the stored value. It is an error to ask
;;;   before FINALIZE-VALUE.
;;; (LABEL 'TOPLEVEL): Return TOPLEVEL (#F after FINALIZE-VALUE).
;;; (LABEL 'TYPE): Return the symbol DATUM-LABEL.
;;; (LABEL 'AS-STRING): Return the label name; only valid after
;;;   FINALIZE->IDENT.
;;; Any other message is forwarded to the name accumulator.
(define read:datum-label
  (lambda (location toplevel)
    (let ((finalized? #f)
          (value '())
          (name (read:ident-builder location)))
      (lambda (op . args)
        (cond
         ((eq? op 'finalize-value)
          (if finalized?
              (error 'datum-label "cannot finalize twice"))
          (set! finalized? #t)
          (set! value (car args))
          (set! toplevel #f))
         ((eq? op 'finalize->ident)
          (set! name (name 'finalize->ident)))
         ((eq? op 'value)
          (if (not finalized?)
              (error 'datum-label "value called before finalize"))
          value)
         ((eq? op 'toplevel) toplevel)
         ((eq? op 'type) 'datum-label)
         ((eq? op 'as-string) (name 'value))
         (else (apply name op args)))))))

;;; Readtable for the number part of a datum label / reference. A label
;;; looks like "#[NUMBER]=" and a reference looks like "#[NUMBER]#".
;;;
;;; Datum label assignments #XXX=[DATUM] return DATUM.
;;; Datum label references #XXX# return a DATUM-LABEL object, because
;;; circular references must be resolved later.
;;; (With some trickery it
;;; could be resolved as soon as possible, but that requires knowledge of
;;; what is complex data and what is an atom. This requires knowing all
;;; possible values that the toplevel could read, which is not possible
;;; because new items can be added to the toplevel.)
;;;
;;; ACC is a READ:DATUM-LABEL object. Digits are accumulated into its
;;; name; "=" defines the label and "#" references it. Any other
;;; character is an error.
(define readtable:datum-label-next
  (readtable:process
   (readtable:empty/default
    (readtable:error 'datum-label-next "invalid datum label/ref"))
   (list readtable:update-list readtable:digits readtable:push-char)
   (list readtable:update
         #\=
         (lambda (_ __ acc port)
           (acc 'finalize->ident)
           ;; Register the label before reading its datum so that the
           ;; datum can refer to itself.
           (port 'set-datum-label! (acc 'as-string) acc)
           (let ((next-value (readtable:next (acc 'toplevel) #f port)))
             (if (eqv? acc next-value)
                 (error 'datum-label-next "datum label cannot be itself"))
             (acc 'finalize-value next-value)
             (acc 'value))))
   (list readtable:update
         #\#
         (lambda (_ __ acc port)
           (acc 'finalize->ident)
           (let ((datum-label-container
                  (port 'get-datum-label (acc 'as-string))))
             (if (null? datum-label-container)
                 ;; Report the label name that failed to resolve.
                 (error 'datum-label-next
                        "unknown reference to datum label"
                        (acc 'as-string))
                 (map:val datum-label-container)))))))

;;; Reads the next toplevel datum, discards it, and then continues at the
;;; toplevel.
;;;
;;; The R7RS reader can cause side-effects due to #!FOLD-CASE and datum
;;; labels. This must be suppressed in datum comments.
(define readtable:datum-comment
  (lambda (_ __ toplevel port)
    (let ((mutable (port 'dump-mutable)))
      (readtable:next toplevel #f port)
      (port 'restore-mutable! mutable))
    (readtable:next toplevel #f port)))

;;; String map from constants to procedures with formal arguments
;;; (TOPLEVEL PORT)
;;; with TOPLEVEL being the current toplevel table and PORT being the
;;; current port being read from.
;;;
;;; Each one must return something. Directives that return nothing must
;;; call the toplevel again.
(define read:hash-constants
  (let ((always
         ;; Handler that ignores its arguments and returns VALUE.
         (lambda (value)
           (lambda unused value)))
        (fold-case-directive
         ;; Handler that switches case folding to ENABLED? and then
         ;; reads the next datum from the toplevel.
         (lambda (enabled?)
           (lambda (toplevel port)
             (port 'fold-case! enabled?)
             (readtable:next toplevel #f port)))))
    (smap:insert-many
     '()
     (cons "true" (always #t))
     (cons "false" (always #f))
     (cons "t" (always #t))
     (cons "f" (always #f))
     (cons "u8"
           (lambda (toplevel port)
             (list 'bytevector
                   (readtable:expect-proper-list toplevel port))))
     (cons "!fold-case" (fold-case-directive #t))
     (cons "!no-fold-case" (fold-case-directive #f)))))

;;; Default action after "#": read an identifier starting at CHAR, look
;;; it up in READ:HASH-CONSTANTS and call the handler found there with
;;; ACC (the toplevel table) and PORT. An unknown name is an error.
(define read:read-hash-constants
  (lambda (_ char acc port)
    (let* ((name ((readtable:read-ident #f char #f port) 'value))
           (entry (smap:search read:hash-constants name)))
      (if (null? entry)
          (error 'read-hash-constants 'unknown-constant name)
          ((map:val entry) acc port)))))

;;; Readtable entered after "#". ACC is the table that was active when
;;; the "#" was read.
(define readtable:hash
  (readtable:process
   (readtable:empty/default read:read-hash-constants)
   ;; "#|" starts a block comment.
   (list readtable:update
         #\|
         (readtable:jump/next readtable:block-comment))
   ;; "#;" starts a datum comment.
   (list readtable:update #\; readtable:datum-comment)
   ;; "#" followed by a digit starts a datum label or reference.
   (list readtable:update-list
         readtable:digits
         (lambda (_ digit outer-table port)
           (readtable:act readtable:datum-label-next
                          digit
                          (read:datum-label (port 'location) outer-table)
                          port)))
   ;; "#(" starts a vector.
   (list readtable:update %bol readtable:vector)))

;;; ;;;;;;;;;;;;;;;;
;;; Toplevel reader.
;;; ;;;;;;;;;;;;;;;;

;;; This is defined as a function so that it dynamically loads each
;;; sub-readtable.
(define readtable:top
  (lambda ()
    (readtable:process
     ;; Anything without an explicit entry starts an identifier.
     (readtable:empty/default readtable:read-ident)
     (list readtable:update-list
           readtable:ASCII-whitespace
           readtable:skip)
     (list readtable:update #f (readtable:return 'eof))
     (list readtable:update %bol readtable:read-list)
     (list readtable:update
           %eol
           (readtable:error 'top "unbalanced list"))
     (list readtable:update
           #\#
           (readtable:next/old-as-acc readtable:hash))
     (list readtable:update
           #\;
           (readtable:jump-discard readtable:read-to-newline)))))

;;; Read one datum from PORT (a PORT->READ object) with a fresh toplevel
;;; readtable.
(define read/toplevel
  (lambda (port)
    ;; Does each invocation of READ keep datum labels from previous reads?
    ;; (port 'clear-datum-labels!)
    (let ((toplevel (readtable:top)))
      (readtable:next toplevel #f port))))

(define intset:insert (set:insert (set:update integer<=>)))
(define intset:in (set:in integer<=>))

;;; Convert the reader's output into printable data.
;;;
;;; Identifier tokens become their names, datum labels are replaced by
;;; their values, and other tokens become #(UNREPRESENTABLE TYPE). A pair
;;; that is reached again is printed as (REF N); the pair it points to is
;;; wrapped as (DEF N = ...), so circular structure prints finitely.
;;;
;;; The two recursive calls stay inside a single CONS call so that the
;;; traversal order (and therefore the numbering) is whatever the
;;; implementation chooses for argument evaluation.
(define uncycle
  (lambda (value)
    (let ((next-id 0)
          (referenced-ids '())
          (seen '()))
      (letrec ((walk
                (lambda (obj)
                  (cond
                   ((pair? obj)
                    (let ((known (assq obj seen)))
                      (cond
                       ((pair? known)
                        (set! referenced-ids
                              (car (intset:insert referenced-ids
                                                  (cdr known))))
                        (list 'ref (cdr known)))
                       (else
                        (set! seen (cons (cons obj next-id) seen))
                        (let ((my-id next-id))
                          (set! next-id (+ 1 next-id))
                          (let ((copy (cons (walk (car obj))
                                            (walk (cdr obj)))))
                            (if (null? (intset:in referenced-ids my-id))
                                copy
                                (list 'def my-id '= copy))))))))
                   ((procedure? obj)
                    (let ((type (obj 'type)))
                      (case type
                        ((ident) (obj 'value))
                        ((datum-label) (walk (obj 'value)))
                        (else (vector 'unrepresentable type)))))
                   (else obj)))))
        (walk value)))))

;;; ;;;;;;;;;;;
;;; Test reader
;;; ;;;;;;;;;;;

;;; Make a PORT->READ object that yields the characters of the list SEQ
;;; and then #F. The file name is reported as "test".
(define %list->read
  (lambda (seq)
    (port->read
     (lambda ()
       (cond
        ((null? seq) #f)
        (else
         (let ((ch (car seq)))
           (set! seq (cdr seq))
           ch))))
     "test")))

;;; Read every datum in STR, displaying each one on its own line after
;;; passing it through UNCYCLE. Returns #T when the input is exhausted.
(define read-all
  (lambda (str)
    (let ((reader (%list->read (string->list str))))
      (let loop ()
        (if (reader 'peek)
            (let ((value (read/toplevel reader)))
              (display (list "return" (uncycle value)))
              (newline)
              (loop))
            #t)))))

(read-all "x yy zz ; this is a comment\nx call/cc ")
(read-all "(a b c def (ghi j) k )")
(read-all "( a . b )")
(read-all "( a .b . c)")
(read-all "#( a b y)")
(read-all "(x y #| this is a block\n comment\n |# z w)")
(read-all "#( a b #| this is a #| nested block |# comment|# z w)")
(read-all "#(a b #(c #|close#|comment|#|#y))")
(read-all "(this has a #;(call with (current continuation)) datum comment)")
(read-all "#0=(#0# not unix)")
(read-all "#!no-fold-case #!fold-case #!NO-FOLD-CASE #false")
(read-all "#!fold-case #TRUE")
(read-all "#u8(x y z w)")