1 ; "prec.scm", dynamically extensible parser/tokenizer -*-scheme-*-
2 ; Copyright 1989, 1990, 1991, 1992, 1993, 1995, 1997 Aubrey Jaffer.
4 ;Permission to copy this software, to redistribute it, and to use it
5 ;for any purpose is granted, subject to the following restrictions and
8 ;1. Any copy made of this software must include this copyright notice
11 ;2. I have made no warrantee or representation that the operation of
12 ;this software will be error-free, and I am under no obligation to
13 ;provide any services, by way of maintenance, update, or otherwise.
15 ;3. In conjunction with products arising from the use of this
16 ;material, there shall be no use of my name in any advertising,
17 ;promotional, or sales literature without prior written consent in
20 ; This file implements:
21 ; * a Pratt style parser.
22 ; * a tokenizer which congeals tokens according to assigned classes of
23 ; constituent characters.
25 ; This module is a significant improvement because grammar can be
26 ; changed dynamically from rulesets which don't need compilation.
27 ; Theoretically, all possibilities of bad input are handled and return
28 ; as much structure as was parsed when the error occured; The symbol
29 ; `?' is substituted for missing input.
31 ; References for the parser are:
34 ; Top Down Operator Precendence.
36 ; Symposium on Principles of Programming Languages,
40 ; CGOL - an Alternative External Representation For LISP users
42 ; MIT Artificial Intelligence Lab.
46 ; MACSYMA Reference Manual, Version Ten,
47 ; Laboratory for Computer Science, MIT, 1983
50 (require 'string-search)
51 (require 'string-port)
54 (define *syn-defs* #f)
55 (define *syn-rules* #f) ;Dynamically bound
56 (define *prec:port* #f) ;Dynamically bound
58 ;; keeps track of input column so we can generate useful error displays.
60 (define (tok:peek-char) (peek-char *prec:port*))
61 (define (tok:read-char)
62 (let ((c (read-char *prec:port*)))
63 (if (or (eqv? c #\newline) (eof-object? c))
65 (set! tok:column (+ 1 tok:column)))
67 (define (tok:bump-column pos . ports)
69 (cond ((null? ports) (thunk))
70 (else (fluid-let ((*prec:port* (car ports))) (thunk)))))
72 (cond ((eqv? #\newline (tok:peek-char))
73 (tok:read-char))) ;to do newline
74 (set! tok:column (+ tok:column pos)))))
75 (define (prec:warn . msgs)
76 (do ((j (+ -1 tok:column) (+ -8 j)))
84 (for-each (lambda (x) (write x) (display #\ )) msgs)
87 ;; Structure of lexical records.
88 (define tok:make-rec cons)
92 (define (tok:lookup alist char)
93 (if (eof-object? char)
95 (let ((pair (assv char alist)))
96 (and pair (cdr pair)))))
98 (define (tok:char-group group chars chars-proc)
100 ;;; (let ((oldlexrec (tok:lookup *syn-defs* token)))
101 ;;; (cond ((or (not oldlexrec) (eqv? (tok:cc oldlexrec) group)))
102 ;;; (else (math:warn 'cc-of token 'redefined-to- group))))
103 (cons token (tok:make-rec group chars-proc)))
104 (cond ((string? chars) (string->list chars))
105 ((char? chars) (list chars))
109 (let* ((char (tok:read-char))
110 (rec (tok:lookup *syn-rules* char))
111 (proc (and rec (tok:cc rec)))
116 (do ((cl clist (begin (set-cdr! cl (list (tok:read-char))) (cdr cl))))
117 ((proc (tok:peek-char))
118 ((or (tok:sfp rec) list->string) clist))))
119 ((eqv? 0 proc) (tokenize))
121 (do ((cl clist (begin (set-cdr! cl (list (tok:read-char))) (cdr cl))))
122 ((not (let* ((prec (tok:lookup *syn-rules* (tok:peek-char)))
123 (cclass (and prec (tok:cc prec))))
124 (or (eqv? cclass proc)
125 (eqv? cclass (+ -1 proc)))))
126 ((tok:sfp rec) clist)))))))
128 ;;; PREC:NUD is the null denotation (function and arguments to call when no
129 ;;; unclaimed tokens).
130 ;;; PREC:LED is the left denotation (function and arguments to call when
131 ;;; unclaimed token is on left).
132 ;;; PREC:LBP is the left binding power of this LED. It is the first
133 ;;; argument position of PREC:LED
135 (define (prec:nudf alist self)
136 (let ((pair (assoc (cons 'nud self) alist)))
137 (and pair (cdr pair))))
138 (define (prec:ledf alist self)
139 (let ((pair (assoc (cons 'led self) alist)))
140 (and pair (cdr pair))))
141 (define (prec:lbp alist self)
142 (let ((pair (assoc (cons 'led self) alist)))
143 (and pair (cadr pair))))
145 (define (prec:call-or-list proc . args)
146 (prec:apply-or-cons proc args))
147 (define (prec:apply-or-cons proc args)
148 (if (procedure? proc) (apply proc args) (cons (or proc '?) args)))
150 ;;; PREC:SYMBOLFY and PREC:DE-SYMBOLFY are not exact inverses.
151 (define (prec:symbolfy obj)
152 (cond ((symbol? obj) obj)
153 ((string? obj) (string->symbol obj))
154 ((char? obj) (string->symbol (string obj)))
157 (define (prec:de-symbolfy obj)
158 (cond ((symbol? obj) (symbol->string obj))
161 ;;;Calls to set up tables.
163 (define (prec:define-grammar . synlsts)
164 (set! *syn-defs* (append (apply append synlsts) *syn-defs*)))
166 (define (prec:make-led toks . args)
168 (cons (cons 'led (prec:de-symbolfy tok))
170 (if (pair? toks) toks (list toks))))
171 (define (prec:make-nud toks . args)
173 (cons (cons 'nud (prec:de-symbolfy tok))
175 (if (pair? toks) toks (list toks))))
177 ;;; Produce dynamically augmented grammars.
178 (define (prec:process-binds binds rules)
179 (if (and #f (not (null? binds)) (eq? #t (car binds)))
181 (append binds rules)))
183 ;;(define (prec:replace-rules) some-sort-of-magic-cookie)
185 ;;; Here are the procedures to define high-level grammar, along with
186 ;;; utility functions called during parsing. The utility functions
187 ;;; (prec:parse-*) could be incorportated into the defining commands,
188 ;;; but tracing these functions is useful for debugging.
190 (define (prec:delim tk)
191 (prec:make-led tk 0 #f))
193 (define (prec:nofix tk sop . binds)
194 (prec:make-nud tk prec:parse-nofix sop (apply append binds)))
195 (define (prec:parse-nofix self sop binds)
196 (set! *syn-rules* (prec:process-binds binds *syn-rules*))
197 (prec:call-or-list (or sop (prec:symbolfy self))))
199 (define (prec:prefix tk sop bp . binds)
200 (prec:make-nud tk prec:parse-prefix sop bp (apply append binds)))
201 (define (prec:parse-prefix self sop bp binds)
202 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
203 (prec:call-or-list (or sop (prec:symbolfy self)) (prec:parse1 bp))))
205 (define (prec:infix tk sop lbp bp . binds)
206 (prec:make-led tk lbp prec:parse-infix sop bp (apply append binds)))
207 (define (prec:parse-infix left self lbp sop bp binds)
208 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
209 (prec:call-or-list (or sop (prec:symbolfy self)) left (prec:parse1 bp))))
211 (define (prec:nary tk sop bp)
212 (prec:make-led tk bp prec:parse-nary sop bp))
213 (define (prec:parse-nary left self lbp sop bp)
214 (prec:apply-or-cons (or sop (prec:symbolfy self))
215 (cons left (prec:parse-list self bp))))
217 (define (prec:postfix tk sop lbp . binds)
218 (prec:make-led tk lbp prec:parse-postfix sop (apply append binds)))
219 (define (prec:parse-postfix left self lbp sop binds)
220 (set! *syn-rules* (prec:process-binds binds *syn-rules*))
221 (prec:call-or-list (or sop (prec:symbolfy self)) left))
223 (define (prec:prestfix tk sop bp . binds)
224 (prec:make-nud tk prec:parse-rest sop bp (apply append binds)))
225 (define (prec:parse-rest self sop bp binds)
226 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
227 (prec:apply-or-cons (or sop (prec:symbolfy self)) (prec:parse-list #f bp))))
229 (define (prec:commentfix tk stp match . binds)
231 (prec:make-nud tk prec:parse-nudcomment stp match (apply append binds))
232 (prec:make-led tk 220 prec:parse-ledcomment stp match (apply append binds))))
233 (define (prec:parse-nudcomment self stp match binds)
234 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
235 (tok:read-through-comment stp match)
237 (cond ((prec:delim? (force prec:token)) #f)
238 (else (prec:parse1 prec:bp)))))
239 (define (prec:parse-ledcomment left lbp self stp match binds)
240 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
241 (tok:read-through-comment stp match)
244 (define (tok:read-through-comment stp match)
245 (set! match (if (char? match)
247 (prec:de-symbolfy match)))
248 (cond ((procedure? stp)
250 (str (call-with-output-string
252 (set! len (find-string-from-port?
254 (lambda (c) (display c sp) #f)))))))
255 (stp (and len (substring str 0 (- len (string-length match)))))))
256 (else (find-string-from-port? match *prec:port*))))
258 (define (prec:matchfix tk sop sep match . binds)
260 (prec:make-nud tk prec:parse-matchfix
261 sop sep-lbp sep match
262 (apply append (prec:delim match) binds)))
263 (define (prec:parse-matchfix self sop sep-lbp sep match binds)
264 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
265 (cond (sop (prec:apply-or-cons
266 sop (prec:parse-delimited sep sep-lbp match)))
267 ((equal? (force prec:token) match)
268 (prec:warn 'expression-missing)
271 (else (let ((ans (prec:parse1 0))) ;just parenthesized expression
272 (cond ((equal? (force prec:token) match)
274 ((prec:delim? (force prec:token))
275 (prec:warn 'mismatched-delimiter (force prec:token)
278 (else (prec:warn 'delimiter-expected--ignoring-rest
279 (force prec:token) 'expected match
281 (do () ((prec:delim? (force prec:token)))
285 (define (prec:inmatchfix tk sop sep match lbp . binds)
287 (prec:make-led tk lbp prec:parse-inmatchfix
288 sop sep-lbp sep match
289 (apply append (prec:delim match) binds)))
290 (define (prec:parse-inmatchfix left self lbp sop sep-lbp sep match binds)
291 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
293 sop (cons left (prec:parse-delimited sep sep-lbp match)))))
295 ;;;; Here is the code which actually parses.
297 (define prec:bp #f) ;dynamically bound
298 (define prec:token #f)
299 (define (prec:advance)
300 (set! prec:token (delay (tokenize))))
301 (define (prec:advance-return-last)
302 (let ((last (and prec:token (force prec:token))))
306 (define (prec:nudcall self)
307 (let ((pob (prec:nudf *syn-rules* self)))
309 (pob (let ((proc (car pob)))
310 (cond ((procedure? proc) (apply proc self (cdr pob)))
311 (proc (cons proc (cdr pob)))
313 ((char? self) (prec:warn 'extra-separator)
315 (prec:nudcall (force prec:token)))
316 ((string? self) (string->symbol self))
319 (define (prec:ledcall left self)
320 (let* ((pob (prec:ledf *syn-rules* self)))
321 (apply (cadr pob) left self (cdr pob))))
323 ;;; PREC:PARSE1 is the heart.
324 (define (prec:parse1 bp)
325 (fluid-let ((prec:bp bp))
326 (do ((left (prec:nudcall (prec:advance-return-last))
327 (prec:ledcall left (prec:advance-return-last))))
328 ((or (>= bp 200) ;to avoid unneccesary lookahead
329 (>= bp (or (prec:lbp *syn-rules* (force prec:token)) 0))
333 (define (prec:delim? token)
334 (or (eof-object? token) (<= (or (prec:lbp *syn-rules* token) 220) 0)))
336 (define (prec:parse-list sep bp)
337 (cond ((prec:delim? (force prec:token))
338 (prec:warn 'expression-missing)
341 (let ((f (prec:parse1 bp)))
342 (cons f (cond ((equal? (force prec:token) sep)
344 (cond ((equal? (force prec:token) sep)
345 (prec:warn 'expression-missing)
347 (cons '? (prec:parse-list sep bp)))
348 ((prec:delim? (force prec:token))
349 (prec:warn 'expression-missing)
351 (else (prec:parse-list sep bp))))
352 ((prec:delim? (force prec:token)) '())
353 ((not sep) (prec:parse-list sep bp))
354 ((prec:delim? sep) (prec:warn 'separator-missing)
355 (prec:parse-list sep bp))
358 (define (prec:parse-delimited sep bp delim)
359 (cond ((equal? (force prec:token) sep)
360 (prec:warn 'expression-missing)
362 (cons '? (prec:parse-delimited sep delim)))
363 ((prec:delim? (force prec:token))
364 (if (not (equal? (force prec:token) delim))
365 (prec:warn 'mismatched-delimiter (force prec:token)
367 (if (not sep) (prec:warn 'expression-missing))
370 (else (let ((ans (prec:parse-list sep bp)))
371 (cond ((equal? (force prec:token) delim))
372 ((prec:delim? (force prec:token))
373 (prec:warn 'mismatched-delimiter (force prec:token)
375 (else (prec:warn 'delimiter-expected--ignoring-rest
376 (force prec:token) '...)
377 (do () ((prec:delim? (force prec:token)))
382 (define (prec:parse grammar delim . port)
383 (set! delim (prec:de-symbolfy delim))
384 (fluid-let ((*syn-rules* (append (prec:delim delim) grammar))
385 (*prec:port* (if (null? port) (current-input-port) (car port))))
386 (prec:advance) ; setup prec:token with first token
387 (cond ((eof-object? (force prec:token)) (force prec:token))
388 ((equal? (force prec:token) delim) #f)
390 (let ((ans (prec:parse1 0)))
391 (cond ((eof-object? (force prec:token)))
392 ((equal? (force prec:token) delim))
393 (else (prec:warn 'delimiter-expected--ignoring-rest
394 (force prec:token) 'not delim)
395 (do () ((or (equal? (force prec:token) delim)
396 (eof-object? (force prec:token))))
400 (define tok:decimal-digits "0123456789")
401 (define tok:upper-case "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
402 (define tok:lower-case "abcdefghijklmnopqrstuvwxyz")
403 (define tok:whitespaces
404 (do ((i (+ -1 (min 256 char-code-limit)) (+ -1 i))
405 (ws "" (if (char-whitespace? (integer->char i))
406 (string-append ws (string (integer->char i)))
410 ;;;;The parse tables.
411 ;;; Definitions accumulate in top-level variable *SYN-DEFS*.
412 (set! *syn-defs* '()) ;Make sure *SYN-DEFS* is empty.
414 ;;; Ignore Whitespace characters.
415 (prec:define-grammar (tok:char-group 0 tok:whitespaces #f))
417 ;;; On MS-DOS systems, <ctrl>-Z (26) needs to be ignored in order to
418 ;;; avoid problems at end of files.
419 (case (software-type)
421 (if (not (char-whitespace? (integer->char 26)))
422 (prec:define-grammar (tok:char-group 0 (integer->char 26) #f))
425 ;;; Save these convenient definitions.
426 (define *syn-ignore-whitespace* *syn-defs*)
427 (set! *syn-defs* '())
431 (trace prec:parse prec:parse1
432 prec:parse-delimited prec:parse-list
433 prec:call-or-list prec:apply-or-cons
434 ;;tokenize prec:advance-return-last prec:advance
435 prec:nudcall prec:ledcall
436 prec:parse-nudcomment prec:parse-ledcomment
437 prec:parse-delimited prec:parse-list
438 prec:parse-nary prec:parse-rest
439 prec:parse-matchfix prec:parse-inmatchfix
440 prec:parse-prefix prec:parse-infix prec:parse-postfix
442 ;;prec:ledf prec:nudf prec:lbp
444 (set! *qp-width* 333))
446 ;;(begin (trace-all "prec.scm") (set! *qp-width* 333))
447 ;;(pretty-print (grammar-read-tab (get-grammar 'standard)))