diff options
| -rw-r--r-- | Makefile | 4 | ||||
| -rw-r--r-- | mcgoron.ris.data.scm | 166 | ||||
| -rw-r--r-- | mcgoron.ris.data.sld | 21 | ||||
| -rw-r--r-- | mcgoron.ris.parse.scm | 115 | ||||
| -rw-r--r-- | mcgoron.ris.parse.sld | 2 | ||||
| -rw-r--r-- | sris.egg | 9 | ||||
| -rw-r--r-- | tests/run.scm | 3 |
7 files changed, 280 insertions, 40 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..dcaa6e7 --- /dev/null +++ b/Makefile @@ -0,0 +1,4 @@ +.POSIX: + +clean: + rm *.build.sh *.install.sh *.import.scm *.so *.link *.o diff --git a/mcgoron.ris.data.scm b/mcgoron.ris.data.scm new file mode 100644 index 0000000..83af524 --- /dev/null +++ b/mcgoron.ris.data.scm @@ -0,0 +1,166 @@ +#| Copyright 2024 Peter McGoron + | + | Licensed under the Apache License, Version 2.0 (the "License"); + | + | you may not use this file except in compliance with the License. + | You may obtain a copy of the License at + | + | http://www.apache.org/licenses/LICENSE-2.0 + | + | Unless required by applicable law or agreed to in writing, software + | distributed under the License is distributed on an "AS IS" BASIS, + | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + | See the License for the specific language governing permissions and + | limitations under the License. + |------------------------------------------------------------------------| + | This file implements mapping from RIS codes to English descriptions. + | The sets and maps are implemented as parameters so they can be changed + | for any non-standard RIS files. + |# + +;;; Utility functions. + +(define (make-mapping-parameter default) + (make-parameter + default + (lambda (x) + (cond + ((mapping? x) x) + ((pair? x) (alist->mapping (make-default-comparator) x)) + (else (error "parameter must be a map" x)))))) + +(define (make-set-parameter default) + (make-parameter + default + (lambda (x) + (cond + ((set? x) x) + ((pair? x) (list->set (make-default-comparator) x)) + (else (error "parameter must be a set" x)))))) + +(define (parameter-lookup param) + (lambda (x) + (mapping-ref/default (param) x #f))) + +;;; Standard author fields. +;;; There are 4 author fields, but some information about the authors +;;; can be interleaved in with the author fields (for instance, "AD", +;;; author address). +;;; +;;; To work around this bad design choice, all authors are put into a +;;; meta-field "AUTHORS" which deliberately does not conform to the RIS +;;; format. +;;; +;;; This should have been a nested format like sexprs or JSON. At least +;;; it's not XML. + +(define author-fields + (make-set-parameter + '("AU" "A2" "A3" "A4"))) + +(define (author-field? x) + (set-contains? (author-fields) x)) + +;;; Maps from standard type and field codes to their descriptions. + +(define type-codes + (make-mapping-parameter + '(("GEN" . "Generic") + ("ART" . "Artwork") + ("ABST" . "Abstract") + ("AGGR" . "Aggregated Database") + ("ANCIENT" . "Ancient Text") + ("ADVS" . "Audiovisual Material") + ("BILL" . "Bill") + ("BLOG" . "Blog") + ("BOOK" . "Book") + ("CHAP" . "Book Section") + ("CASE" . "Case") + ("CTLG" . "Catalog") + ("CHART" . "Chart") + ("CLSWK" . "Classical Work") + ("COMP" . "Computer Program") + ("CPAPER" . "Conference Paper") + ("CONF" . "Conference Proceeding") + ("DATA" . "Dataset") + ("DICT" . "Dictionary") + ("EDBOOK" . "Edited Book") + ("EBOOK" . "Electronic Book") + ("ECHAP" . "Electronic Book Section") + ("EJOUR" . "Electronic Article") + ("ENCYC" . "Encyclopedia") + ("EQUA" . "Equation") + ("FIGURE" . "Figure") + ("MPCT" . "Film or Broadcast") + ("HEAR" . "Hearing") + ("INPR" . "In Press Article") + ("ICOMM" . "Internet Communication") + ("JOUR" . "Journal Article") + ("LEGAL" . "Legal Rule") + ("MGZN" . "Magazine Article") + ("MANSCPT" . "Manuscript") + ("MAP" . "Map") + ("MUSIC" . "Music") + ("NEWS" . "Newspaper Article") + ("DBASE" . "Online Database") + ("MULTI" . "Online Multimedia") + ("PAMP" . "Pamphlet") + ("PAT" . "Patent") + ("PCOMM" . "Personal Communication") + ("RPRT" . "Report") + ("SER" . "Serial") + ("SLIDE" . "Slide") + ("SOUND" . "Sound Recording") + ("STAND" . "Standard") + ("STAT" . "Statute") + ("THES" . "Thesis") + ("UNBILL" . "Unenacted Bill") + ("UNPD" . "Unpublished Work") + ("VIDEO" . "Video Recording") + ("ELEC" . "Web Page")))) + +(define ris-type-name + (parameter-lookup type-codes)) + +(define field-codes + (make-mapping-parameter + '(("TY" . "GEN") + ("A2" . "Secondary Author") + ("A3" . "Tertiary Author") + ("A4" . "Subsidiary Author") + ("AB" . "Abstract") + ("AD" . "Author Address") + ("AN" . "Accession Number") + ("AU" . "Author") + ("C1" . "Custom 1") + ("C2" . "Custom 2") + ("C3" . "Custom 3") + ("C4" . "Custom 4") + ("C5" . "Custom 5") + ("C6" . "Custom 6") + ("C7" . "Custom 7") + ("C8" . "Custom 8") + ("CA" . "Call Number") + ("CN" . "Caption") + ("CY" . "Place Published") + ("DA" . "Date") + ("DB" . "Name of Database") + ("DO" . "DOI") + ("DP" . "Database Provider") + ("ET" . "Edition") + ("J2" . "Alternate Title") + ("KW" . "Keywords") + ("L1" . "File Attachments") + ("L4" . "Language") + ("LA" . "Figure") + ("LB" . "Label") + ("IS" . "Number") + ("M3" . "Type of Work") + ("N1" . "Notes") + ("NV" . "Number of Volumes") + ("OP" . "Original Publication") + ("PB" . "Publisher") + ("PY" . "Year")))) + +(define ris-field-name + (parameter-lookup field-codes)) diff --git a/mcgoron.ris.data.sld b/mcgoron.ris.data.sld new file mode 100644 index 0000000..4b32d34 --- /dev/null +++ b/mcgoron.ris.data.sld @@ -0,0 +1,21 @@ +#| Copyright 2024 Peter McGoron + | + | Licensed under the Apache License, Version 2.0 (the "License"); + | + | you may not use this file except in compliance with the License. + | You may obtain a copy of the License at + | + | http://www.apache.org/licenses/LICENSE-2.0 + | + | Unless required by applicable law or agreed to in writing, software + | distributed under the License is distributed on an "AS IS" BASIS, + | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + | See the License for the specific language governing permissions and + | limitations under the License. + |# + +(define-library (mcgoron ris data) + (import (scheme base) (srfi 146) (srfi 113) (srfi 128)) + (export author-fields type-codes field-codes + author-field? ris-type-name ris-field-name) + (include "mcgoron.ris.data.scm"))
\ No newline at end of file diff --git a/mcgoron.ris.parse.scm b/mcgoron.ris.parse.scm index 3711a96..2bd1c42 100644 --- a/mcgoron.ris.parse.scm +++ b/mcgoron.ris.parse.scm @@ -17,55 +17,79 @@ ;;; Stream destructors. (define (stream-pair=> stream) + ;; Destructure a stream into its car and cdr. (cond-values (when-ct (stream-pair? stream) (values (stream-car stream) (stream-cdr stream))))) (define (stream-predicate=> stream predicate?) + ;; Destructure a stream into its car and cdr if the car satisfies + ;; PREDICATE? (cond-values (after ((let (stream-pair=> stream) => (head stream)) (when (predicate? head))) (values head stream)))) (define-syntax with-stream-predicates + ;; Sequence stream predicates together. + ;; The final cdr is bound to STREAM. The resultant CAR of each test + ;; is bound to NAME. (syntax-rules () ((_ stream ((predicate? name) ...) body ...) (after ((let (stream-predicate=> stream predicate?) => (name stream)) ...) body ...)))) -;;; Mapping destructors. +;;; Mapping utils. -(define (mapping-ref=> mapping key) - (mapping-ref mapping key (lambda () (values)))) - -(define (mapping-new-field mapping key val) - (mapping-set mapping key (flexvector val))) +(define (add-to-mapping mapping name rest) + ;; Add REST to the flexvector mapped to NAME in MAPPING. If NAME is + ;; not mapped, allocate a new flexvector with REST as its first element. + (let ((fv (mapping-ref/default mapping name #f))) + (cond + (fv (flexvector-add-back! fv rest) + mapping) + (else (mapping-set mapping name (flexvector rest)))))) ;;; Flexvector utils. (define (flexvector-update-last! fv update) + ;; Update the last value in FV with the result of applying UPDATE + ;; to the last value in FV. (let ((len (flexvector-length fv))) (flexvector-set! fv (- len 1) (update (flexvector-ref fv (- len 1)))))) +(define (append-new-line! fv rest) + ;; Append to the last value in FV the string REST. + (flexvector-update-last! fv + (lambda (x) + (string-append x "\n" rest)))) + ;;; Parsing each line. -(define (prefix-char? ch) +(define (field-char? ch) + ;; Check if CH is a valid RIS field char. (or (char-upper-case? ch) (char-numeric? ch))) (define (end=> stream) + ;; Check if STREAM is at the end of a line. (cond-values + ;; EOF is end of line (when-ct (stream-null? stream) stream) + ;; Handle CRLF (with-stream-predicates stream (((cut eqv? #\return <>) _) ((cut eqv? #\newline <>) _)) stream) + ;; Handle LF (with-stream-predicates stream (((cut eqv? #\newline <>) _)) stream))) (define (read-rest stream) + ;; Read the rest of the stream up to the end of a line, and return + ;; it as a string. (let ((fv (flexvector))) (let read-rest ((stream stream)) (cond-thunk @@ -77,12 +101,16 @@ (read-rest stream)) (else (raise (stream-error stream))))))) -(define (prefix=> stream) +(define (field=> stream) + ;; At the start of a line, read a field specifier, or return no + ;; values. + ;; + ;; TODO: This needs to read /[A-Z0-9]{2,} - / (let ((ws? (lambda (x) (char=? x #\space)))) (cond-values - (with-stream-predicates stream ((prefix-char? char1) - (prefix-char? char2) + (with-stream-predicates stream ((field-char? char1) + (field-char? char2) (ws? _) (ws? _) ((cut char=? #\- <>) _) @@ -92,48 +120,63 @@ ;;; Parsing of references. +(define (add/special-fields mapping name rest) + (let ((mapping (add-to-mapping mapping name rest))) + ;; If NAME is an author field, append the name to the special field + ;; AUTHOR. + (if (author-field? name) + (add-to-mapping mapping "AUTHOR" rest) + mapping))) + (define (read-reference stream mapping last-name) + ;; Read an entire reference up to the ER field. This function starts + ;; at the beginning of a line. + ;; + ;; All new fields are added to MAPPING. Repeated fields are appended + ;; to a flexvector that the field name maps to. + ;; + ;; If the beginning of the line is not a field, then it is a + ;; continuation of the last field, specified by LAST-NAME. + ;; + ;; Returns (VALUES STREAM MAPPING) after ER. (cond-thunk - (after ((let (prefix=> stream) => (name rest stream))) - (cond-thunk - (when-ct (equal? name "ER") - (values stream mapping)) - (after ((let (mapping-ref=> mapping name) => (fv))) - (flexvector-add-back! fv rest) - (read-reference stream mapping name)) - (else - (read-reference stream - (mapping-new-field mapping - name - rest) - name)))) + (after ((let (field=> stream) => (name rest stream))) + (if (equal? name "ER") + (values stream mapping) + (let ((mapping (add/special-fields mapping name rest))) + (read-reference stream mapping name)))) (when-ct (stream-null? stream) (values stream mapping)) (else (let-values - (((fv) (mapping-ref mapping - last-name - (lambda () (error "internal error" stream)))) + (((fv) (mapping-ref mapping last-name)) ((rest stream) (read-rest stream))) - (flexvector-update-last! fv - (lambda (last) - (string-append last "\n" rest))) + (append-new-line! rest) (read-reference stream mapping last-name))))) -(define (read-new-reference stream mapping) +(define (new-mapping name rest) + (add-to-mapping (mapping (make-default-comparator)) name rest)) + +(define (read-new-reference stream references-fv) + ;; Read the start of a new reference. + ;; + ;; If the file is at EOF, return MAPPING. + ;; Otherwise, read the TY field of the new reference. If there is no + ;; TY field, raise MALFORMED-REFERENCE-ERROR. (cond-thunk (when-ct (stream-null? stream) - mapping) - (after ((let (prefix=> stream) => (name rest stream)) + references-fv) + (after ((let (field=> stream) => (name rest stream)) (when (string=? name "TY"))) - (let-values (((stream mapping) + (let-values (((stream ref-mapping) (read-reference stream - (mapping-new-field mapping name rest) + (new-mapping name rest) name))) - (read-new-reference stream mapping))) + (flexvector-add-back! references-fv ref-mapping) + (read-new-reference stream references-fv))) (else (raise (malformed-reference-error stream))))) (define (ris->mapping port) (read-new-reference (port->stream port) - (mapping (make-default-comparator)))) + (flexvector))) diff --git a/mcgoron.ris.parse.sld b/mcgoron.ris.parse.sld index aa02ac0..ddc2f85 100644 --- a/mcgoron.ris.parse.sld +++ b/mcgoron.ris.parse.sld @@ -18,6 +18,6 @@ (import (scheme base) (scheme char) (srfi 214) (srfi 128) (srfi 146) (srfi 26) (srfi 41) (mcgoron cond-thunk) (mcgoron cond-thunk values) - (mcgoron ris exceptions)) + (mcgoron ris exceptions) (mcgoron ris data)) (export ris->mapping) (include "mcgoron.ris.parse.scm"))
\ No newline at end of file @@ -3,12 +3,17 @@ (synopsis "Parse RIS files") (category "parsing") (license "Apache-2.0") - (dependencies "r7rs") + (dependencies "r7rs" "srfi-146" "srfi-113" "srfi-214" "srfi-128" + "srfi-146" "srfi-41" "srfi-128") (test-dependencies "test") (components (extension mcgoron.ris.exceptions (source "mcgoron.ris.exceptions.sld") (csc-options "-R" "r7rs" "-X" "r7rs")) + (extension mcgoron.ris.data + (source "mcgoron.ris.data.sld") + (csc-options "-R" "r7rs" "-X" "r7rs")) (extension mcgoron.ris.parse (source "mcgoron.ris.parse.sld") (csc-options "-R" "r7rs" "-X" "r7rs") - (component-dependencies mcgoron.ris.exceptions)))) + (component-dependencies mcgoron.ris.exceptions + mcgoron.ris.data)))) diff --git a/tests/run.scm b/tests/run.scm index 1d80564..f5c95ae 100644 --- a/tests/run.scm +++ b/tests/run.scm @@ -17,7 +17,8 @@ (import r7rs (mcgoron ris parse) test (srfi 146) (srfi 214)) (define file - (call-with-input-file "10.1038_s41467-022-34369-4-citation.ris" ris->mapping)) + (flexvector-front + (call-with-input-file "10.1038_s41467-022-34369-4-citation.ris" ris->mapping))) (test-begin "10.1038_s41467-022-34369-4-citation.ris") (define-syntax test-ref |
