UNSLISP/read.scm

797 lines
29 KiB
Scheme

;;; Copyright (C) Peter McGoron 2024
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License as published by
;;; the Free Software Foundation, version 3 of the License.
;;;
;;; This program is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program. If not, see <https://www.gnu.org/licenses/>.
;;; R7RS reader. This is the lexer-parser end, so it returns tokens and
;;; not concrete objects.
;;;
;;; Notes:
;;;
;;; Port stores datum labels. Datum labels are stored for the entirety of
;;; a READ: this is to emulate MIT Scheme, which allows for datum labels
;;; outside of the datum that the label appears in.
;;;
;;; The reader does not return Scheme data: it returns annotated data
;;; containing the source location, datum label number, resolved datum
;;; label pointer. This is for advanced syntax systems.
;;;
;;; How datum labels could work:
;;;
;;; When encountering #[number]=, allocate a datum label and assign it
;;; nothing. Then call READ after "=", and destructively update the
;;; datum label with the resulting datum. A pass over the new read
;;; structure to convert it to regular Scheme data will resolve the
;;; indirection.
;;;
;;; All tokens are procedure-encapsulated objects, since the reader should
;;; never return a literal procedure. Each procedure has a TYPE message.
(load "chez-compat.scm")
(load "util.scm")
(load "set.scm")
(load "linked-list.scm")
;;; My text editor cannot parse Scheme's character syntax.
(define %bol #\()
(define %eol #\))
;;; ;;;;;;;;;;;;;;;;;;;;;;;;
;;; Port reader wrapper
;;; ;;;;;;;;;;;;;;;;;;;;;;;;
(define port->read-function
(lambda (port)
(lambda ()
(read-char port))))
;;; READ:
;;;
;;; Stream readers contain mutable state. This is the case-folding mode
;;; and the current list of datum labels.
;;;
;;; (POS): Return (LIST FILENAME LINE-NUMBER OFFSET).
;;; (READ): Read the next character in the stream. Returns #F on EOF.
;;; (PUSH CHAR): Push CHAR such that it will be the next character read
;;; when (READ) is called.
;;; (PEEK): Read character, push it back, and return it.
;;; (FOLD-CASE?): Returns a boolean if case folding is enabled.
;;; (FOLD-CASE! BOOL): Sets the case folding to BOOL.
(define port->read
(lambda (read-function filename)
(let ((line-number 1)
(offset 0)
(pushback-buffer '())
(datum-labels '())
(fold-case? #f))
(letrec ((update-position!
(lambda (ch)
(cond
((eqv? ch #\newline)
(set! line-number (+ 1 line-number)) (set! offset 0))
;; OFFSET is sometimes set to #F to denote an unknown
;; offset.
(offset (set! offset (+ 1 offset))))))
(location
(lambda () (list filename line-number offset)))
(set-datum-label!
(lambda (label value)
(set! datum-labels
(car (smap:insert datum-labels label value)))))
(get-datum-label
(lambda (label)
(smap:search datum-labels label)))
(clear-datum-labels!
(lambda ()
(set! datum-labels '())))
(dump-mutable
(lambda ()
(list datum-labels fold-case?)))
(restore-mutable!
(lambda (state)
(set! datum-labels (car state))
(set! fold-case? (cadr state))))
(process
(lambda (ch)
(update-position! ch)
(cond
((or (eof-object? ch) (not ch)) ch)
(fold-case? (char-downcase ch))
(else ch))))
(port
(lambda (op . args)
;; TODO: turn into string map?
(cond
((eq? op 'location) (location))
((eq? op 'read)
(process
(if (null? pushback-buffer)
(read-function)
(let ((ch (car pushback-buffer)))
(set! pushback-buffer (cdr pushback-buffer))
ch))))
((eq? op 'peek)
(let ((ch (port 'read)))
(port 'push ch)
ch))
((eq? op 'push)
(let ((ch (car args)))
(if (eqv? ch #\newline)
(begin
(set! line-number (- line-number 1))
(set! offset #f))
(set! offset (- offset 1)))
(set! pushback-buffer (cons ch pushback-buffer))))
((eq? op 'fold-case?) fold-case?)
((eq? op 'fold-case!) (set! fold-case? (car args)))
((eq? op 'set-datum-label!) (apply set-datum-label! args))
((eq? op 'get-datum-label) (apply get-datum-label args))
((eq? op 'clear-datum-labels!) (apply clear-datum-labels! args))
((eq? op 'dump-mutable) (apply dump-mutable args))
((eq? op 'restore-mutable!) (apply restore-mutable! args))
(else (error 'read->port 'invalid (cons op args)))))))
port))))
;;; ;;;;;;;;;;;;;;
;;; Character maps
;;; ;;;;;;;;;;;;;;
(define integer<=>
(lambda (x y)
(cond
((< x y) '<)
((= x y) '=)
(else '>))))
;;; Comparison on characters extended to #F, which is less than all
;;; characters.
(define char*<=>
(lambda (x y)
(cond
((and (not x) y) '<)
((and x (not y)) '>)
((and (not x) (not y) '=))
(else (integer<=> (char->integer x)
(char->integer y))))))
(define %charmap:<=> (set:<=>-to-map char*<=>))
(define %charmap:update (set:update %charmap:<=>))
(define charmap:update (map:update %charmap:update))
(define charmap:insert (map:insert %charmap:update))
(define charmap:search (map:search %charmap:<=>))
;;; ;;;;;;;;;;;;;;;;;;;;;;
;;; Readtable constructors
;;;
;;; Readtables are composed of a CHARMAP, which is a map from characters
;;; to actions, and a DEFAULT-ACTION, which is taken when there is no
;;; match in CHARMAP.
;;;
;;; An "action" is a procedure that takes four arguments:
;;;
;;; TABLE: The current table.
;;; CHAR: The character that was matched against the CHARMAP in TABLE.
;;; ACC: An arbitrary "accumulator" value that is different depending
;;; on the readtable in question.
;;; PORT: A port reader object.
;;; ;;;;;;;;;;;;;;;;;;;;;;
;;; (READTABLE:NEW DEFAULT-ACTION CHARMAP)
(define readtable:new cons)
(define %readtable:default-action car)
(define %readtable:charmap cdr)
;;; Run the action in TABLE assigned to CHAR, or the default action of
;;; TABLE if there is no entry for CHAR.
(define readtable:act
(lambda (table char acc port)
(let ((node (charmap:search (%readtable:charmap table)
char)))
(let ((action (if (null? node)
(%readtable:default-action table)
(map:val node))))
(action table char acc port)))))
;;; Run the action in TABLE with the next character from PORT.
(define readtable:next
(lambda (table acc port)
(readtable:act table (port 'read) acc port)))
;;; Return a new readtable where CHAR is bound to ACTION.
(define readtable:update
(lambda (table char action)
(readtable:new (%readtable:default-action table)
(car (charmap:insert
(%readtable:charmap table) char action)))))
;;; Update TABLE to act on all characters in LST with ACTION.
(define readtable:update-list
(lambda (table lst action)
(fold (lambda (char table)
(readtable:update table char action))
table
lst)))
;;; Construct new readtable with no characters in its map and
;;; DEFAULT-ACTION as the default action.
(define readtable:empty/default
(lambda (default-action)
(readtable:new default-action '())))
;;; Each value in FUNCTIONS is a list (PROCEDURE ARGS...) which is called
;;; like (PROCEDURE TABLE ARGS...) and returns a table.
(define readtable:process
(lambda (table . functions)
(fold (lambda (function table)
(apply (car function) table (cdr function)))
table
functions)))
;;; ;;;;;;;;;;;;;;;;;;
;;; Default actions
;;; ;;;;;;;;;;;;;;;;;;
;;; Return an error.
(define readtable:error
(lambda emsg
(lambda (table char acc port)
(error emsg char acc table port))))
;;; Discard the current character and continue reading the readtable.
(define readtable:skip
(lambda (table char acc port)
(readtable:act table (port 'read) acc port)))
;;; Discard char and return constant.
(define readtable:return
(lambda (return)
(lambda (table char acc port)
return)))
;;; Jump to a new readtable, discard it's return, and continue reading
;;; in the table.
(define readtable:jump-discard
(lambda (newtable)
(lambda (oldtable char acc port)
(readtable:act newtable (port 'read) '() port)
(readtable:act oldtable (port 'read) acc port))))
;;; Jump to a new readtable with the same characters.
(define readtable:jump
(lambda (newtable)
(lambda (oldtable char acc port)
(readtable:act newtable char acc port))))
;;; Jump to a new readtable, reading the new character, with the old
;;; readtable as ACC.
(define readtable:next/old-as-acc
(lambda (newtable)
(lambda (oldtable __ _ port)
(readtable:next newtable oldtable port))))
;;; Jump to a new readtable, reading the new character.
(define readtable:jump/next
(lambda (newtable)
(lambda (oldtable _ acc port)
(readtable:next newtable acc port))))
;;; ;;;;;;;;;;;;;;;;;
;;; Identifier reader
;;; ;;;;;;;;;;;;;;;;;
(define read:ident
(lambda (name location)
(lambda (op . args)
(cond
((eq? op 'type) 'ident)
((eq? op 'value) name)
(else (error 'read:ident "invalid operation" op args))))))
(define read:ident-builder
(lambda (location . start-char)
(let ((char-list (linked-list:new)))
(if (not (null? start-char))
(char-list 'push (car start-char)))
(lambda (op . args)
(cond
((eq? op 'finalize->ident)
(read:ident (list->string (char-list 'to-list)) location))
(else (apply char-list op args)))))))
;;; Push back CHAR and return ACC.
(define readtable:return-acc-as-ident
(lambda (table char acc port)
(port 'push char)
(acc 'finalize->ident)))
;;; Push CHAR to ACC and continue reading from TABLE.
(define readtable:push-char
(lambda (table char acc port)
(acc 'push-tail char)
(readtable:act table (port 'read) acc port)))
;;; Define a readtable that constructs an identifier by accepting all
;;; characters that are not listed.
(define readtable:exclude-from-identifiers
(lambda (table excluded)
(fold (lambda (char table)
(readtable:update table char readtable:return-acc-as-ident))
table
excluded)))
;;; ASCII whitespace.
(define readtable:ASCII-whitespace
(list #\newline
#\space
(integer->char #x09)
(integer->char #x0B)
(integer->char #x0C)
(integer->char #x0D)))
;;; Readtable for identifiers.
(define readtable:identifier
(readtable:process
(readtable:empty/default readtable:push-char)
(list readtable:exclude-from-identifiers
readtable:ASCII-whitespace)
(list readtable:exclude-from-identifiers
(list #\| %bol %eol #\' #\; #f))))
;;; Read an identifier starting with CHAR.
(define readtable:read-ident
(lambda (table char acc port)
(readtable:act readtable:identifier
(port 'read)
(read:ident-builder (port 'location)
char)
port)))
;;; ;;;;;;;;;;;;;;;;;;;;
;;; Comments and whitespace reader
;;; ;;;;;;;;;;;;;;;;;;;;
;;; Readtable for a line comment.
(define readtable:read-to-newline
(readtable:process
(readtable:empty/default readtable:skip)
(list readtable:update #\newline (readtable:return #f))))
;;; ;;;;;;;;;;;
;;; List reader
;;;
;;; The reader updates the previous readtable to handle ). This means
;;; that this read table does not have to handle end-of-line, whitespace,
;;; etc.
;;; ;;;;;;;;;;;
;;; Read the end of an improper list.
(define readtable:read-improper-cdr
(lambda (table acc port)
(let ((val
(readtable:act (readtable:update table
%eol
(readtable:error
'read-improper-cdr
"proper list must have cdr"))
(port 'read)
#f
port)))
(acc 'set-cdr! val)
(let ((table (readtable:process
(readtable:empty/default (readtable:error
'read-improper-cdr
"improper list has 1 cdr"))
(list readtable:update-list
readtable:ASCII-whitespace
readtable:skip)
(list readtable:update %eol
(lambda dummy 'end-of-list)))))
(readtable:act table (port 'read) acc port)))))
;;; Generic reader loop for a list. It takes as input the table that has
;;; already been updated with end of list and improper list handlers.
(define readtable:read-list-loop
(lambda (table port)
(let ((acc (linked-list:new)))
(letrec ((loop
(lambda ()
(let ((value (readtable:act table
(port 'read)
acc
port)))
(cond
((eqv? value 'end-of-list) (acc 'to-list))
(else (acc 'push-tail value)
(loop)))))))
(loop)))))
;;; Readtable for a list, generic to proper and improper list
;;; readers.
(define readtable:table-for-list
(lambda (oldtable on-dot)
(readtable:process
oldtable
(list readtable:update %eol (readtable:return 'end-of-list))
(list readtable:update #\.
(lambda (table char acc port)
(let ((entire-identifier (readtable:read-ident
table
char
#f
port)))
(if (equal? entire-identifier ".")
(on-dot table acc port)
entire-identifier)))))))
;;; Read a proper or improper list.
(define readtable:read-list
(lambda (oldtable _ __ port)
(readtable:read-list-loop (readtable:table-for-list
oldtable
readtable:read-improper-cdr)
port)))
;;; Read strictly a proper list. This assumes that BOL has been read.
(define readtable:read-proper-list
(lambda (table port)
(readtable:read-list-loop (readtable:table-for-list
table
(readtable:error
'read-proper-list
"expected proper list"))
port)))
(define readtable:expect-proper-list
(lambda (table port)
(if (not (eqv? (port 'read) %bol))
(error 'expect-port-list 'no-list-found)
(readtable:read-proper-list table port))))
;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Reader for stuff that start with "#"
;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define readtable:digits
(list #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))
(define readtable:vector
(lambda (_ __ toplevel port)
(list 'vector (readtable:read-proper-list toplevel port))))
;;; Block comment reader.
;;;
;;; The outermost block comment reader is passed the toplevel reader as
;;; ACC. When the outermost block is finished, it will tail-call ACC.
;;; (It is basically the continuation of the reader.)
;;;
;;; When a nested block comment is found, it is passed #F as ACC, which
;;; it will not call. It will return an unspecified value.
;;;
;;; Since the read tables are not procedures, references to other tables
;;; in the same LETREC declaration must be protected with explicit LAMBDAs.
;;; Macros could make this much easier to read.
(define readtable:block-comment
(letrec ((potential-end
(readtable:process
(readtable:empty/default
(lambda (this char acc port) (readtable:act
loop
char
acc
port)))
(list readtable:update #\#
(lambda (this char acc port)
(if acc
(readtable:next acc #f port))))))
(potential-start
(readtable:process
(readtable:empty/default
(lambda (this char acc port) (readtable:act
loop
char
acc
port)))
(list readtable:update #\|
(lambda (this char acc port)
(readtable:next loop #f port)
(readtable:next loop acc port)))))
(loop
(readtable:process
(readtable:empty/default readtable:skip)
(list readtable:update #\#
(lambda (this char acc port)
(readtable:next potential-start
acc
port)))
(list readtable:update #\|
(lambda (this char acc port)
(readtable:next potential-end
acc
port))))))
loop))
(define read:datum-label
(lambda (location toplevel)
(let ((finalized? #f)
(value '())
(name (read:ident-builder location)))
(lambda (op . args)
(cond
((eq? op 'finalize-value)
(if finalized?
(error 'datum-label "cannot finalize twice"))
(set! finalized? #t)
(set! value (car args))
(set! toplevel #f))
((eq? op 'finalize->ident)
(set! name (name 'finalize->ident)))
((eq? op 'value)
(if (not finalized?)
(error 'datum-label "value called before finalize"))
value)
((eq? op 'toplevel) toplevel)
((eq? op 'type) 'datum-label)
((eq? op 'as-string) (name 'value))
(else (apply name op args)))))))
;;; Readtable for the number part of a datum label / reference. A label
;;; looks like "#[NUMBER]=" and a reference looks like "#[NUMBER]#".
;;;
;;; Datum label assignments #XXX=[DATUM] return DATUM.
;;; Datum label references #XXX# return a DATUM-LABEL object, because
;;; circular references must be resolved later. (With some trickery it
;;; cold be resolved as soon as possible, but that requires knowledge of
;;; what is complex data and what is an atom. This requires knowing all
;;; possible values that the toplevel could read, which is not possible
;;; because new items can be added to the toplevel.)
(define readtable:datum-label-next
(readtable:process
(readtable:empty/default (readtable:error 'datum-label-next
"invalid datum label/ref"))
(list readtable:update-list readtable:digits readtable:push-char)
(list readtable:update #\=
(lambda (_ __ acc port)
(acc 'finalize->ident)
(port 'set-datum-label! (acc 'as-string) acc)
(let ((next-value (readtable:next (acc 'toplevel)
#f
port)))
(if (eqv? acc next-value)
(error 'datum-label-next "datum label cannot be itself"))
(acc 'finalize-value next-value)
(acc 'value))))
(list readtable:update #\#
(lambda (_ __ acc port)
(acc 'finalize->ident)
(let ((datum-label-container (port 'get-datum-label
(acc 'as-string))))
(if (null? datum-label-container)
(error 'datum-label-next
"unknown reference to datum label" label)
(map:val datum-label-container)))))))
;;; Reads the next toplevel datum, discards it, and then continues at the
;;; toplevel.
;;;
;;; The R7RS reader can cause side-effects due to #!FOLD-CASE and datum
;;; labels. This must be supressed in datum comments.
(define readtable:datum-comment
(lambda (_ __ toplevel port)
(let ((mutable (port 'dump-mutable)))
(readtable:next toplevel #f port)
(port 'restore-mutable! mutable))
(readtable:next toplevel #f port)))
;;; ;;;;;;;;;;;;;;;;;;;;;;;;
;;; Handling hash messages
;;;
;;; A "hash message" is any identifier that immediately follows a "#".
;;; This can be a constant, a part of a constant (like for numbers), or
;;; a directive (like #!FOLD-CASE).
;;; String map from constants to procedures with formal arguments
;;; (TOPLEVEL PORT)
;;; with TOPLEVEL being the current toplevel table and PORT being the
;;; current port being read from.
;;;
;;; Each one must return something. Directives that return nothing must
;;; call the toplevel again.
(define read:hash-messages
(smap:insert-many
'()
(cons "true" (lambda unused #t))
(cons "false" (lambda unused #f))
(cons "t" (lambda unused #t))
(cons "f" (lambda unused #f))
(cons "u8" (lambda (toplevel port)
(list 'bytevector
(readtable:expect-proper-list toplevel port))))
(cons "!fold-case"
(lambda (toplevel port)
(port 'fold-case! #t)
(readtable:act toplevel (port 'read) #f port)))
(cons "!no-fold-case"
(lambda (toplevel port)
(port 'fold-case! #f)
(readtable:act toplevel (port 'read) #f port)))))
(define read:read-hash-messages
(lambda (_ char acc port)
(let ((ident (readtable:read-ident #f char #f port)))
(let ((container (smap:search read:hash-messages
(ident 'value))))
;; TODO: check if the hash message starts a number.
(if (null? container)
(error 'read-hash-messages 'unknown-constant
(ident 'value))
((map:val container) acc port))))))
;;; ;;;;;;;;;;;;;;;;;;;;;;
;;; Characters
;;;
;;; For completeness, this adds a few more newline characters that are C
;;; escape sequences but are not standard Scheme.
(define read:named-characters
(smap:insert-many
'()
(cons "null" (integer->char 0))
(cons "alarm" (integer->char #x7))
(cons "backspace" (integer->char #x8))
(cons "tab" (integer->char #x9))
(cons "newline" (integer->char #xA))
(cons "vertical-tab" (integer->char #xB))
(cons "form-feed" (integer->char #xC))
(cons "return" (integer->char #xD))
(cons "escape" (integer->char #x1B))
(cons "space" (integer->char #x20))
(cons "delete" (integer->char #x7F))))
(define readtable:character
(lambda (_ char __ port)
(let ((char (port 'read)))
(let ((ident (readtable:read-ident #f char #f port)))
(let ((container (smap:search read:named-characters
(ident 'value))))
(display (ident 'value))
(newline)
(cond
((and (null? container)
(= (string-length (ident 'value)) 1))
(string-ref (ident 'value) 0))
((null? container) (error 'character 'unknown-character ident))
(else (map:val container))))))))
(define readtable:hash
(readtable:process
(readtable:empty/default read:read-hash-messages)
(list readtable:update-list readtable:ASCII-whitespace
(readtable:error 'hash 'whitespace-after-hash))
(list readtable:update #\| (readtable:jump/next readtable:block-comment))
(list readtable:update #\; readtable:datum-comment)
(list readtable:update #\\ readtable:character)
(list readtable:update-list readtable:digits ; Datum labels
(lambda (_ char toplevel port)
(readtable:act readtable:datum-label-next
char
(read:datum-label (port 'location) toplevel)
port)))
(list readtable:update %bol readtable:vector)))
;;; ;;;;;;;;;;;;;;;;
;;; Toplevel reader.
;;; ;;;;;;;;;;;;;;;;
;;; This is defined as a function so that it dynamically loads each
;;; sub-readtable.
(define readtable:top
(lambda ()
(readtable:process
(readtable:empty/default readtable:read-ident)
(list readtable:update-list readtable:ASCII-whitespace readtable:skip)
(list readtable:update #f (readtable:return 'eof))
(list readtable:update %bol readtable:read-list)
(list readtable:update %eol (readtable:error 'top "unbalanced list"))
(list readtable:update #\# (readtable:next/old-as-acc
readtable:hash))
(list readtable:update #\;
(readtable:jump-discard readtable:read-to-newline)))))
(define read/toplevel
(lambda (port)
;; Does each invocation of READ keep datum labels from previous reads?
;; (port 'clear-datum-labels!)
(readtable:next (readtable:top)
#f
port)))
(define intset:insert (set:insert (set:update integer<=>)))
(define intset:in (set:in integer<=>))
(define uncycle
(lambda (value)
(let ((cntr 0)
(used-counters '())
(pointers '()))
(letrec ((uncycle
(lambda (value)
(cond
((pair? value)
(let ((pair (assq value pointers)))
(if (pair? pair)
(begin
(set! used-counters
(car (intset:insert used-counters (cdr pair))))
(list 'ref (cdr pair)))
(begin
(set! pointers (cons (cons value cntr)
pointers))
(let ((cur-cntr cntr))
(set! cntr (+ 1 cntr))
(let ((returned (cons (uncycle (car value))
(uncycle (cdr value)))))
(if (not (null? (intset:in used-counters cur-cntr)))
(list 'def cur-cntr '= returned)
returned)))))))
((procedure? value)
(let ((type (value 'type)))
(cond
((eq? type 'ident) (value 'value))
((eq? type 'datum-label) (uncycle (value 'value)))
(else (vector 'unrepresentable type)))))
(else value)))))
(uncycle value)))))
;;; ;;;;;;;;;;;
;;; Test reader
;;; ;;;;;;;;;;;
(define %list->read
(lambda (seq)
(port->read
(lambda ()
(if (null? seq)
#f
(let ((ch (car seq)))
(set! seq (cdr seq))
ch)))
"test")))
(define read-all
(lambda (str)
(let ((reader (%list->read (string->list str))))
(letrec ((loop
(lambda ()
(if (not (reader 'peek))
#t
(let ((value (read/toplevel reader)))
(display (list "return" (uncycle value)))
(newline)
(loop))))))
(loop)))))
(read-all "x yy zz ; this is a comment\nx call/cc ")
(read-all "(a b c def (ghi j) k )")
(read-all "( a . b )")
(read-all "( a .b . c)")
(read-all "#( a b y)")
(read-all "(x y #| this is a block\n comment\n |# z w)")
(read-all "#( a b #| this is a #| nested block |# comment|# z w)")
(read-all "#(a b #(c #|close#|comment|#|#y))")
(read-all "(this has a #;(call with (current continuation)) datum comment)")
(read-all "#0=(#0# not unix)")
(read-all "#!no-fold-case #!fold-case #!NO-FOLD-CASE #false")
(read-all "#!fold-case #TRUE")
(read-all "#u8(x y z w)")
(read-all "#\\newline")
(read-all "#\\a")