merge from master to elisp

[bpt/guile.git] / module / language / elisp / parser.scm
diff --git a/module/language/elisp/parser.scm b/module/language/elisp/parser.scm

dissimilarity index 68%

index 431eba3..04229d8 100644 (file)
--- a/module/language/elisp/parser.scm
+++ b/module/language/elisp/parser.scm
@@ -1,76 +1,212 @@
-;;; Guile Emac Lisp
-
-;; Copyright (C) 2001 Free Software Foundation, Inc.
-
-;; This program is free software; you can redistribute it and/or modify
-;; it under the terms of the GNU General Public License as published by
-;; the Free Software Foundation; either version 2, or (at your option)
-;; any later version.
-;; 
-;; This program is distributed in the hope that it will be useful,
-;; but WITHOUT ANY WARRANTY; without even the implied warranty of
-;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;; GNU General Public License for more details.
-;; 
-;; You should have received a copy of the GNU General Public License
-;; along with this program; see the file COPYING.  If not, write to
-;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-;; Boston, MA 02111-1307, USA.
-
-;;; Code:
-
-(define-module (language elisp parser)
-  #:use-module (language elisp lexer)
-  #:use-module (language ecmascript parse-lalr)
-  #:export (read-elisp))
-
-; The parser (reader) for elisp expressions.  It is implemented using the
-; (text parse-lalr) parser generator and uses my hand-written lexer as
-; the tokenizer.
-
-
-; Build the parser itself using parse-lalr.
-
-(define elisp-parser
-  (lalr-parser (integer float symbol character string
-                paren-open paren-close square-open square-close
-                dot quote backquote unquote unquote-splicing)
-
-    ; Expressions are our main interest.
-    ; It seems the symbol we're interested for return from the parser must
-    ; come very first, so here it is.
-    (expression (integer) -> $1
-                (float) -> $1
-                (symbol) -> $1
-                (character) -> $1
-                (string) -> $1
-                (list) -> $1
-                (quotation) -> $1
-                (vector) -> $1)
-
-    ; Pairs, lists and dotted lists.
-    (partial-list (expression) -> (list $1)
-                  (expression dot expression) -> (cons $1 $3)
-                  (expression partial-list) -> (cons $1 $2))
-    (list (paren-open paren-close) -> '()
-          (paren-open dot expression paren-close) -> $3
-          (paren-open partial-list paren-close) -> $2)
-
-    ; Quotation and unquotation expressions.
-    (quotation (quote expression) -> `(quote ,$2)
-               (backquote expression) -> `(\` ,$2)
-               (unquote expression) -> `(\, ,$2)
-               (unquote-splicing expression) -> `(\,@ ,$2))
-
-    ; Vectors.
-    (vector-elements (expression) -> (list $1)
-                     (expression vector-elements) -> (cons $1 $2))
-    (vector (square-open square-close) -> (make-vector 0)
-            (square-open vector-elements square-close) -> (list->vector $2))))
-
-
-; Use the parser to define the elisp reader function.
-; We only want to read a single expression at a time, so use get-lexer/1.
-
-(define (read-elisp port)
-  (elisp-parser (get-lexer/1 port) error))
+;;; Guile Emac Lisp
+
+;; Copyright (C) 2009 Free Software Foundation, Inc.
+
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;; 
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;; 
+;; You should have received a copy of the GNU General Public License
+;; along with this program; see the file COPYING.  If not, write to
+;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+;; Boston, MA 02111-1307, USA.
+
+;;; Code:
+
+(define-module (language elisp parser)
+  #:use-module (language elisp lexer)
+  #:export (read-elisp))
+
+; The parser (reader) for elisp expressions.
+; Is is hand-written (just as the lexer is) instead of using some parser
+; generator because this allows easier transfer of source properties from the
+; lexer ((text parse-lalr) seems not to allow access to the original lexer
+; token-pair) and is easy enough anyways.
+
+
+; Report a parse error.  The first argument is some current lexer token
+; where source information is available should it be useful.
+
+(define (parse-error token msg . args)
+  (apply error msg args))
+
+
+; For parsing circular structures, we keep track of definitions in a
+; hash-map that maps the id's to their values.
+; When defining a new id, though, we immediatly fill the slot with a promise
+; before parsing and setting the real value, because it must already be
+; available at that time in case of a circular reference.  The promise refers
+; to a local variable that will be set when the real value is available through
+; a closure.  After parsing the expression is completed, we work through it
+; again and force all promises we find.
+; The definitions themselves are stored in a fluid and their scope is one
+; call to read-elisp (but not only the currently parsed expression!).
+
+(define circular-definitions (make-fluid))
+
+(define (make-circular-definitions)
+  (make-hash-table))
+
+(define (circular-ref token)
+  (if (not (eq? (car token) 'circular-ref))
+    (error "invalid token for circular-ref" token))
+  (let* ((id (cdr token))
+         (value (hashq-ref (fluid-ref circular-definitions) id)))
+    (if value
+      value
+      (parse-error token "undefined circular reference" id))))
+
+; Returned is a closure that, when invoked, will set the final value.
+; This means both the variable the promise will return and the hash-table
+; slot so we don't generate promises any longer.
+(define (circular-define! token)
+  (if (not (eq? (car token) 'circular-def))
+    (error "invalid token for circular-define!" token))
+  (let ((value #f)
+        (table (fluid-ref circular-definitions))
+        (id (cdr token)))
+    (hashq-set! table id (delay value))
+    (lambda (real-value)
+      (set! value real-value)
+      (hashq-set! table id real-value))))
+
+; Work through a parsed data structure and force the promises there.
+; After a promise is forced, the resulting value must not be recursed on;
+; this may lead to infinite recursion with a circular structure, and
+; additionally this value was already processed when it was defined.
+; All deep data structures that can be parsed must be handled here!
+(define (force-promises! data)
+  (cond
+    ((pair? data)
+     (begin
+       (if (promise? (car data))
+         (set-car! data (force (car data)))
+         (force-promises! (car data)))
+       (if (promise? (cdr data))
+         (set-cdr! data (force (cdr data)))
+         (force-promises! (cdr data)))))
+    ((vector? data)
+     (let ((len (vector-length data)))
+       (let iterate ((i 0))
+         (if (< i len)
+           (let ((el (vector-ref data i)))
+             (if (promise? el)
+               (vector-set! data i (force el))
+               (force-promises! el))
+             (iterate (1+ i)))))))
+    ; Else nothing needs to be done.
+  ))
+
+
+; We need peek-functionality for the next lexer token, this is done with some
+; single token look-ahead storage.  This is handled by a closure which allows
+; getting or peeking the next token.
+; When one expression is fully parsed, we don't want a look-ahead stored here
+; because it would miss from future parsing.  This is verified by the finish
+; action.
+
+(define (make-lexer-buffer lex)
+  (let ((look-ahead #f))
+    (lambda (action)
+      (if (eq? action 'finish)
+        (if look-ahead
+          (error "lexer-buffer is not empty when finished")
+          #f)
+        (begin
+          (if (not look-ahead)
+            (set! look-ahead (lex)))
+          (case action
+            ((peek) look-ahead)
+            ((get)
+             (let ((result look-ahead))
+               (set! look-ahead #f)
+               result))
+            (else (error "invalid lexer-buffer action" action))))))))
+
+
+; Get the contents of a list, where the opening parentheses has already been
+; found.  The same code is used for vectors and lists, where lists allow the
+; dotted tail syntax and vectors not; additionally, the closing parenthesis
+; must of course match.
+; The implementation here is not tail-recursive, but I think it is clearer
+; and simpler this way.
+
+(define (get-list lex allow-dot close-square)
+  (let* ((next (lex 'peek))
+         (type (car next)))
+    (cond
+      ((eq? type (if close-square 'square-close 'paren-close))
+       (begin
+         (if (not (eq? (car (lex 'get)) type))
+           (error "got different token than peeked"))
+         '()))
+      ((and allow-dot (eq? type 'dot))
+       (begin
+         (if (not (eq? (car (lex 'get)) type))
+           (error "got different token than peeked"))
+         (let ((tail (get-list lex #f close-square)))
+           (if (not (= (length tail) 1))
+             (parse-error next "expected exactly one element after dot"))
+           (car tail))))
+      (else
+        ; Do both parses in exactly this sequence!
+        (let* ((head (get-expression lex))
+               (tail (get-list lex allow-dot close-square)))
+          (cons head tail))))))
+
+
+
+; Parse a single expression from a lexer-buffer.  This is the main routine in
+; our recursive-descent parser.
+
+(define quotation-symbols '((quote . quote)
+                            (backquote . \`)
+                            (unquote . \,)
+                            (unquote-splicing . \,@)))
+
+(define (get-expression lex)
+  (let* ((token (lex 'get))
+         (type (car token))
+         (return (lambda (result)
+                   (if (pair? result)
+                     (set-source-properties! result (source-properties token)))
+                   result)))
+    (case type
+      ((integer float symbol character string)
+       (return (cdr token)))
+      ((quote backquote unquote unquote-splicing)
+       (return (list (assq-ref quotation-symbols type) (get-expression lex))))
+      ((paren-open)
+       (return (get-list lex #t #f)))
+      ((square-open)
+       (return (list->vector (get-list lex #f #t))))
+      ((circular-ref)
+       (circular-ref token))
+      ((circular-def)
+       ; The order of definitions is important!
+       (let* ((setter (circular-define! token))
+              (expr (get-expression lex)))
+         (setter expr)
+         (force-promises! expr)
+         expr))
+      (else
+        (parse-error token "expected expression, got" token)))))
+
+
+; Define the reader function based on this; build a lexer, a lexer-buffer,
+; and then parse a single expression to return.
+; We also define a circular-definitions data structure to use.
+
+(define (read-elisp port)
+  (with-fluids ((circular-definitions (make-circular-definitions)))
+    (let* ((lexer (get-lexer port))
+           (lexbuf (make-lexer-buffer lexer))
+           (result (get-expression lexbuf)))
+      (lexbuf 'finish)
+      result)))