add env script
[bpt/guile.git] / module / slib / prec.scm
1 ; "prec.scm", dynamically extensible parser/tokenizer -*-scheme-*-
2 ; Copyright 1989, 1990, 1991, 1992, 1993, 1995, 1997 Aubrey Jaffer.
3 ;
4 ;Permission to copy this software, to redistribute it, and to use it
5 ;for any purpose is granted, subject to the following restrictions and
6 ;understandings.
7 ;
8 ;1. Any copy made of this software must include this copyright notice
9 ;in full.
10 ;
11 ;2. I have made no warrantee or representation that the operation of
12 ;this software will be error-free, and I am under no obligation to
13 ;provide any services, by way of maintenance, update, or otherwise.
14 ;
15 ;3. In conjunction with products arising from the use of this
16 ;material, there shall be no use of my name in any advertising,
17 ;promotional, or sales literature without prior written consent in
18 ;each case.
19
20 ; This file implements:
21 ; * a Pratt style parser.
22 ; * a tokenizer which congeals tokens according to assigned classes of
23 ; constituent characters.
24 ;
25 ; This module is a significant improvement because grammar can be
26 ; changed dynamically from rulesets which don't need compilation.
27 ; Theoretically, all possibilities of bad input are handled and return
28 ; as much structure as was parsed when the error occured; The symbol
29 ; `?' is substituted for missing input.
30
31 ; References for the parser are:
32
33 ; Pratt, V. R.
34 ; Top Down Operator Precendence.
35 ; SIGACT/SIGPLAN
36 ; Symposium on Principles of Programming Languages,
37 ; Boston, 1973, 41-51
38
39 ; WORKING PAPER 121
40 ; CGOL - an Alternative External Representation For LISP users
41 ; Vaughan R. Pratt
42 ; MIT Artificial Intelligence Lab.
43 ; March 1976
44
45 ; Mathlab Group,
46 ; MACSYMA Reference Manual, Version Ten,
47 ; Laboratory for Computer Science, MIT, 1983
48
49 (require 'fluid-let)
50 (require 'string-search)
51 (require 'string-port)
52 (require 'delay)
53
54 (define *syn-defs* #f)
55 (define *syn-rules* #f) ;Dynamically bound
56 (define *prec:port* #f) ;Dynamically bound
57
58 ;; keeps track of input column so we can generate useful error displays.
59 (define tok:column 0)
60 (define (tok:peek-char) (peek-char *prec:port*))
61 (define (tok:read-char)
62 (let ((c (read-char *prec:port*)))
63 (if (or (eqv? c #\newline) (eof-object? c))
64 (set! tok:column 0)
65 (set! tok:column (+ 1 tok:column)))
66 c))
67 (define (tok:bump-column pos . ports)
68 ((lambda (thunk)
69 (cond ((null? ports) (thunk))
70 (else (fluid-let ((*prec:port* (car ports))) (thunk)))))
71 (lambda ()
72 (cond ((eqv? #\newline (tok:peek-char))
73 (tok:read-char))) ;to do newline
74 (set! tok:column (+ tok:column pos)))))
75 (define (prec:warn . msgs)
76 (do ((j (+ -1 tok:column) (+ -8 j)))
77 ((> 8 j)
78 (do ((i j (+ -1 i)))
79 ((>= 0 i))
80 (display #\ )))
81 (display slib:tab))
82 (display "^ ")
83 (newline)
84 (for-each (lambda (x) (write x) (display #\ )) msgs)
85 (newline))
86
87 ;; Structure of lexical records.
88 (define tok:make-rec cons)
89 (define tok:cc car)
90 (define tok:sfp cdr)
91
92 (define (tok:lookup alist char)
93 (if (eof-object? char)
94 #f
95 (let ((pair (assv char alist)))
96 (and pair (cdr pair)))))
97
98 (define (tok:char-group group chars chars-proc)
99 (map (lambda (token)
100 ;;; (let ((oldlexrec (tok:lookup *syn-defs* token)))
101 ;;; (cond ((or (not oldlexrec) (eqv? (tok:cc oldlexrec) group)))
102 ;;; (else (math:warn 'cc-of token 'redefined-to- group))))
103 (cons token (tok:make-rec group chars-proc)))
104 (cond ((string? chars) (string->list chars))
105 ((char? chars) (list chars))
106 (else chars))))
107
108 (define (tokenize)
109 (let* ((char (tok:read-char))
110 (rec (tok:lookup *syn-rules* char))
111 (proc (and rec (tok:cc rec)))
112 (clist (list char)))
113 (cond
114 ((not proc) char)
115 ((procedure? proc)
116 (do ((cl clist (begin (set-cdr! cl (list (tok:read-char))) (cdr cl))))
117 ((proc (tok:peek-char))
118 ((or (tok:sfp rec) list->string) clist))))
119 ((eqv? 0 proc) (tokenize))
120 (else
121 (do ((cl clist (begin (set-cdr! cl (list (tok:read-char))) (cdr cl))))
122 ((not (let* ((prec (tok:lookup *syn-rules* (tok:peek-char)))
123 (cclass (and prec (tok:cc prec))))
124 (or (eqv? cclass proc)
125 (eqv? cclass (+ -1 proc)))))
126 ((tok:sfp rec) clist)))))))
127
128 ;;; PREC:NUD is the null denotation (function and arguments to call when no
129 ;;; unclaimed tokens).
130 ;;; PREC:LED is the left denotation (function and arguments to call when
131 ;;; unclaimed token is on left).
132 ;;; PREC:LBP is the left binding power of this LED. It is the first
133 ;;; argument position of PREC:LED
134
135 (define (prec:nudf alist self)
136 (let ((pair (assoc (cons 'nud self) alist)))
137 (and pair (cdr pair))))
138 (define (prec:ledf alist self)
139 (let ((pair (assoc (cons 'led self) alist)))
140 (and pair (cdr pair))))
141 (define (prec:lbp alist self)
142 (let ((pair (assoc (cons 'led self) alist)))
143 (and pair (cadr pair))))
144
145 (define (prec:call-or-list proc . args)
146 (prec:apply-or-cons proc args))
147 (define (prec:apply-or-cons proc args)
148 (if (procedure? proc) (apply proc args) (cons (or proc '?) args)))
149
150 ;;; PREC:SYMBOLFY and PREC:DE-SYMBOLFY are not exact inverses.
151 (define (prec:symbolfy obj)
152 (cond ((symbol? obj) obj)
153 ((string? obj) (string->symbol obj))
154 ((char? obj) (string->symbol (string obj)))
155 (else obj)))
156
157 (define (prec:de-symbolfy obj)
158 (cond ((symbol? obj) (symbol->string obj))
159 (else obj)))
160
161 ;;;Calls to set up tables.
162
163 (define (prec:define-grammar . synlsts)
164 (set! *syn-defs* (append (apply append synlsts) *syn-defs*)))
165
166 (define (prec:make-led toks . args)
167 (map (lambda (tok)
168 (cons (cons 'led (prec:de-symbolfy tok))
169 args))
170 (if (pair? toks) toks (list toks))))
171 (define (prec:make-nud toks . args)
172 (map (lambda (tok)
173 (cons (cons 'nud (prec:de-symbolfy tok))
174 args))
175 (if (pair? toks) toks (list toks))))
176
177 ;;; Produce dynamically augmented grammars.
178 (define (prec:process-binds binds rules)
179 (if (and #f (not (null? binds)) (eq? #t (car binds)))
180 (cdr binds)
181 (append binds rules)))
182
183 ;;(define (prec:replace-rules) some-sort-of-magic-cookie)
184
185 ;;; Here are the procedures to define high-level grammar, along with
186 ;;; utility functions called during parsing. The utility functions
187 ;;; (prec:parse-*) could be incorportated into the defining commands,
188 ;;; but tracing these functions is useful for debugging.
189
190 (define (prec:delim tk)
191 (prec:make-led tk 0 #f))
192
193 (define (prec:nofix tk sop . binds)
194 (prec:make-nud tk prec:parse-nofix sop (apply append binds)))
195 (define (prec:parse-nofix self sop binds)
196 (set! *syn-rules* (prec:process-binds binds *syn-rules*))
197 (prec:call-or-list (or sop (prec:symbolfy self))))
198
199 (define (prec:prefix tk sop bp . binds)
200 (prec:make-nud tk prec:parse-prefix sop bp (apply append binds)))
201 (define (prec:parse-prefix self sop bp binds)
202 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
203 (prec:call-or-list (or sop (prec:symbolfy self)) (prec:parse1 bp))))
204
205 (define (prec:infix tk sop lbp bp . binds)
206 (prec:make-led tk lbp prec:parse-infix sop bp (apply append binds)))
207 (define (prec:parse-infix left self lbp sop bp binds)
208 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
209 (prec:call-or-list (or sop (prec:symbolfy self)) left (prec:parse1 bp))))
210
211 (define (prec:nary tk sop bp)
212 (prec:make-led tk bp prec:parse-nary sop bp))
213 (define (prec:parse-nary left self lbp sop bp)
214 (prec:apply-or-cons (or sop (prec:symbolfy self))
215 (cons left (prec:parse-list self bp))))
216
217 (define (prec:postfix tk sop lbp . binds)
218 (prec:make-led tk lbp prec:parse-postfix sop (apply append binds)))
219 (define (prec:parse-postfix left self lbp sop binds)
220 (set! *syn-rules* (prec:process-binds binds *syn-rules*))
221 (prec:call-or-list (or sop (prec:symbolfy self)) left))
222
223 (define (prec:prestfix tk sop bp . binds)
224 (prec:make-nud tk prec:parse-rest sop bp (apply append binds)))
225 (define (prec:parse-rest self sop bp binds)
226 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
227 (prec:apply-or-cons (or sop (prec:symbolfy self)) (prec:parse-list #f bp))))
228
229 (define (prec:commentfix tk stp match . binds)
230 (append
231 (prec:make-nud tk prec:parse-nudcomment stp match (apply append binds))
232 (prec:make-led tk 220 prec:parse-ledcomment stp match (apply append binds))))
233 (define (prec:parse-nudcomment self stp match binds)
234 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
235 (tok:read-through-comment stp match)
236 (prec:advance)
237 (cond ((prec:delim? (force prec:token)) #f)
238 (else (prec:parse1 prec:bp)))))
239 (define (prec:parse-ledcomment left lbp self stp match binds)
240 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
241 (tok:read-through-comment stp match)
242 (prec:advance)
243 left))
244 (define (tok:read-through-comment stp match)
245 (set! match (if (char? match)
246 (string match)
247 (prec:de-symbolfy match)))
248 (cond ((procedure? stp)
249 (let* ((len #f)
250 (str (call-with-output-string
251 (lambda (sp)
252 (set! len (find-string-from-port?
253 match *prec:port*
254 (lambda (c) (display c sp) #f)))))))
255 (stp (and len (substring str 0 (- len (string-length match)))))))
256 (else (find-string-from-port? match *prec:port*))))
257
258 (define (prec:matchfix tk sop sep match . binds)
259 (define sep-lbp 0)
260 (prec:make-nud tk prec:parse-matchfix
261 sop sep-lbp sep match
262 (apply append (prec:delim match) binds)))
263 (define (prec:parse-matchfix self sop sep-lbp sep match binds)
264 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
265 (cond (sop (prec:apply-or-cons
266 sop (prec:parse-delimited sep sep-lbp match)))
267 ((equal? (force prec:token) match)
268 (prec:warn 'expression-missing)
269 (prec:advance)
270 '?)
271 (else (let ((ans (prec:parse1 0))) ;just parenthesized expression
272 (cond ((equal? (force prec:token) match)
273 (prec:advance))
274 ((prec:delim? (force prec:token))
275 (prec:warn 'mismatched-delimiter (force prec:token)
276 'not match)
277 (prec:advance))
278 (else (prec:warn 'delimiter-expected--ignoring-rest
279 (force prec:token) 'expected match
280 'or-delimiter)
281 (do () ((prec:delim? (force prec:token)))
282 (prec:parse1 0))))
283 ans)))))
284
285 (define (prec:inmatchfix tk sop sep match lbp . binds)
286 (define sep-lbp 0)
287 (prec:make-led tk lbp prec:parse-inmatchfix
288 sop sep-lbp sep match
289 (apply append (prec:delim match) binds)))
290 (define (prec:parse-inmatchfix left self lbp sop sep-lbp sep match binds)
291 (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
292 (prec:apply-or-cons
293 sop (cons left (prec:parse-delimited sep sep-lbp match)))))
294
295 ;;;; Here is the code which actually parses.
296
297 (define prec:bp #f) ;dynamically bound
298 (define prec:token #f)
299 (define (prec:advance)
300 (set! prec:token (delay (tokenize))))
301 (define (prec:advance-return-last)
302 (let ((last (and prec:token (force prec:token))))
303 (prec:advance)
304 last))
305
306 (define (prec:nudcall self)
307 (let ((pob (prec:nudf *syn-rules* self)))
308 (cond
309 (pob (let ((proc (car pob)))
310 (cond ((procedure? proc) (apply proc self (cdr pob)))
311 (proc (cons proc (cdr pob)))
312 (else '?))))
313 ((char? self) (prec:warn 'extra-separator)
314 (prec:advance)
315 (prec:nudcall (force prec:token)))
316 ((string? self) (string->symbol self))
317 (else self))))
318
319 (define (prec:ledcall left self)
320 (let* ((pob (prec:ledf *syn-rules* self)))
321 (apply (cadr pob) left self (cdr pob))))
322
323 ;;; PREC:PARSE1 is the heart.
324 (define (prec:parse1 bp)
325 (fluid-let ((prec:bp bp))
326 (do ((left (prec:nudcall (prec:advance-return-last))
327 (prec:ledcall left (prec:advance-return-last))))
328 ((or (>= bp 200) ;to avoid unneccesary lookahead
329 (>= bp (or (prec:lbp *syn-rules* (force prec:token)) 0))
330 (not left))
331 left))))
332
333 (define (prec:delim? token)
334 (or (eof-object? token) (<= (or (prec:lbp *syn-rules* token) 220) 0)))
335
336 (define (prec:parse-list sep bp)
337 (cond ((prec:delim? (force prec:token))
338 (prec:warn 'expression-missing)
339 '(?))
340 (else
341 (let ((f (prec:parse1 bp)))
342 (cons f (cond ((equal? (force prec:token) sep)
343 (prec:advance)
344 (cond ((equal? (force prec:token) sep)
345 (prec:warn 'expression-missing)
346 (prec:advance)
347 (cons '? (prec:parse-list sep bp)))
348 ((prec:delim? (force prec:token))
349 (prec:warn 'expression-missing)
350 '(?))
351 (else (prec:parse-list sep bp))))
352 ((prec:delim? (force prec:token)) '())
353 ((not sep) (prec:parse-list sep bp))
354 ((prec:delim? sep) (prec:warn 'separator-missing)
355 (prec:parse-list sep bp))
356 (else '())))))))
357
358 (define (prec:parse-delimited sep bp delim)
359 (cond ((equal? (force prec:token) sep)
360 (prec:warn 'expression-missing)
361 (prec:advance)
362 (cons '? (prec:parse-delimited sep delim)))
363 ((prec:delim? (force prec:token))
364 (if (not (equal? (force prec:token) delim))
365 (prec:warn 'mismatched-delimiter (force prec:token)
366 'expected delim))
367 (if (not sep) (prec:warn 'expression-missing))
368 (prec:advance)
369 (if sep '() '(?)))
370 (else (let ((ans (prec:parse-list sep bp)))
371 (cond ((equal? (force prec:token) delim))
372 ((prec:delim? (force prec:token))
373 (prec:warn 'mismatched-delimiter (force prec:token)
374 'expecting delim))
375 (else (prec:warn 'delimiter-expected--ignoring-rest
376 (force prec:token) '...)
377 (do () ((prec:delim? (force prec:token)))
378 (prec:parse1 bp))))
379 (prec:advance)
380 ans))))
381
382 (define (prec:parse grammar delim . port)
383 (set! delim (prec:de-symbolfy delim))
384 (fluid-let ((*syn-rules* (append (prec:delim delim) grammar))
385 (*prec:port* (if (null? port) (current-input-port) (car port))))
386 (prec:advance) ; setup prec:token with first token
387 (cond ((eof-object? (force prec:token)) (force prec:token))
388 ((equal? (force prec:token) delim) #f)
389 (else
390 (let ((ans (prec:parse1 0)))
391 (cond ((eof-object? (force prec:token)))
392 ((equal? (force prec:token) delim))
393 (else (prec:warn 'delimiter-expected--ignoring-rest
394 (force prec:token) 'not delim)
395 (do () ((or (equal? (force prec:token) delim)
396 (eof-object? (force prec:token))))
397 (prec:advance))))
398 ans)))))
399
400 (define tok:decimal-digits "0123456789")
401 (define tok:upper-case "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
402 (define tok:lower-case "abcdefghijklmnopqrstuvwxyz")
403 (define tok:whitespaces
404 (do ((i (+ -1 (min 256 char-code-limit)) (+ -1 i))
405 (ws "" (if (char-whitespace? (integer->char i))
406 (string-append ws (string (integer->char i)))
407 ws)))
408 ((negative? i) ws)))
409
410 ;;;;The parse tables.
411 ;;; Definitions accumulate in top-level variable *SYN-DEFS*.
412 (set! *syn-defs* '()) ;Make sure *SYN-DEFS* is empty.
413
414 ;;; Ignore Whitespace characters.
415 (prec:define-grammar (tok:char-group 0 tok:whitespaces #f))
416
417 ;;; On MS-DOS systems, <ctrl>-Z (26) needs to be ignored in order to
418 ;;; avoid problems at end of files.
419 (case (software-type)
420 ((MSDOS)
421 (if (not (char-whitespace? (integer->char 26)))
422 (prec:define-grammar (tok:char-group 0 (integer->char 26) #f))
423 )))
424
425 ;;; Save these convenient definitions.
426 (define *syn-ignore-whitespace* *syn-defs*)
427 (set! *syn-defs* '())
428
429 (define (prec:trace)
430 (require 'trace)
431 (trace prec:parse prec:parse1
432 prec:parse-delimited prec:parse-list
433 prec:call-or-list prec:apply-or-cons
434 ;;tokenize prec:advance-return-last prec:advance
435 prec:nudcall prec:ledcall
436 prec:parse-nudcomment prec:parse-ledcomment
437 prec:parse-delimited prec:parse-list
438 prec:parse-nary prec:parse-rest
439 prec:parse-matchfix prec:parse-inmatchfix
440 prec:parse-prefix prec:parse-infix prec:parse-postfix
441 ;;prec:delim?
442 ;;prec:ledf prec:nudf prec:lbp
443 )
444 (set! *qp-width* 333))
445
446 ;;(begin (trace-all "prec.scm") (set! *qp-width* 333))
447 ;;(pretty-print (grammar-read-tab (get-grammar 'standard)))
448 ;;(prec:trace)