add PEG parser generator
[bpt/guile.git] / module / ice-9 / peg.scm
1 (define-module (ice-9 peg)
2 :export (peg-sexp-compile peg-string-compile context-flatten peg-parse define-nonterm define-nonterm-f peg-match get-code define-grammar define-grammar-f peg:start peg:end peg:string peg:tree peg:substring peg-record? keyword-flatten)
3 :autoload (ice-9 pretty-print) (peg-sexp-compile peg-string-compile context-flatten peg-parse define-nonterm define-nonterm-f peg-match get-code define-grammar define-grammar-f keyword-flatten)
4 :use-module (ice-9 pretty-print))
5
6 (use-modules (ice-9 pretty-print))
7
8 (eval-when (compile load eval)
9
10 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
11 ;;;;; CONVENIENCE MACROS
12 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
13
14 (define (eeval exp)
15 (eval exp (interaction-environment)))
16
17 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
18 ;;;;; MACRO BUILDERS
19 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
20
21 ;; Safe-bind helps to bind macros safely.
22 ;; e.g.:
23 ;; (safe-bind
24 ;; (a b)
25 ;; `(,a ,b))
26 ;; gives:
27 ;; (#<uninterned-symbol a cc608d0> #<uninterned-symbol b cc608a0>)
28 (define-syntax safe-bind
29 (lambda (x)
30 (syntax-case x ()
31 ((_ vals . actions)
32 (datum->syntax x (apply safe-bind-f
33 (cons
34 (syntax->datum #'vals)
35 (syntax->datum #'actions))))))))
36 ;; (define-macro (safe-bind vals . actions)
37 ;; (apply safe-bind-f (cons vals actions)))
38 (define (safe-bind-f vals . actions)
39 `(let ,(map (lambda (val) `(,val (make-symbol ,(symbol->string val)))) vals)
40 ,@actions))
41
42 ;; Unsafe-bind is like safe-bind but uses symbols that are easier to read while
43 ;; debugging rather than safe ones. Currently unused.
44 ;; (define-macro (unsafe-bind vals . actions)
45 ;; (apply unsafe-bind-f (cons vals actions)))
46 ;; (define (unsafe-bind-f vals . actions)
47 ;; `(let ,(map (lambda (val) `(,val ',val)) vals)
48 ;; ,@actions))
49
50 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
51 ;;;;; LOOPING CONSTRUCTS
52 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
53
54 ;; Perform ACTION. If it succeeded, return its return value. If it failed, run
55 ;; IF_FAILS and try again
56 (define-syntax until-works
57 (lambda (x)
58 (syntax-case x ()
59 ((_ action if-fails)
60 #'(let ((retval action))
61 (while (not retval)
62 if-fails
63 (set! retval action))
64 retval)))))
65
66 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
67 ;;;;; GENERIC LIST-PROCESSING MACROS
68 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
69
70 ;; Return #t if the list has only one element (calling length all the time on
71 ;; potentially long lists was really slow).
72 (define-syntax single?
73 (lambda (x)
74 (syntax-case x ()
75 ((_ lst)
76 #'(and (list? lst) (not (null? lst)) (null? (cdr lst)))))))
77
78 ;; Push an object onto a list.
79 (define-syntax push!
80 (lambda (x)
81 (syntax-case x ()
82 ((_ lst obj)
83 #'(set! lst (cons obj lst))))))
84
85 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
86 ;;;;; CODE GENERATORS
87 ;; These functions generate scheme code for parsing PEGs.
88 ;; Conventions:
89 ;; accum: (all name body none)
90 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
91
92 ;; Code we generate will be defined in a function, and always has to test
93 ;; whether it's beyond the bounds of the string before it executes.
94 (define (cg-generic-lambda str strlen at code)
95 `(lambda (,str ,strlen ,at)
96 (if (>= ,at ,strlen)
97 #f
98 ,code)))
99 ;; The short name makes the formatting below much easier to read.
100 (define cggl cg-generic-lambda)
101
102 ;; Optimizations for CG-GENERIC-RET below...
103 (define *op-known-single-body* '(cg-string cg-peg-any cg-range))
104 ;; ...done with optimizations (could use more of these).
105
106 ;; Code we generate will have a certain return structure depending on how we're
107 ;; accumulating (the ACCUM variable).
108 (define (cg-generic-ret accum name body-uneval at)
109 (safe-bind
110 (body)
111 `(let ((,body ,body-uneval))
112 ,(cond
113 ((and (eq? accum 'all) name body)
114 `(list ,at
115 (cond
116 ((not (list? ,body)) (list ',name ,body))
117 ((null? ,body) ',name)
118 ((symbol? (car ,body)) (list ',name ,body))
119 (#t (cons ',name ,body)))))
120 ((and (eq? accum 'name) name)
121 `(list ,at ',name))
122 ((and (eq? accum 'body) body)
123 (cond
124 ((member name *op-known-single-body*)
125 `(list ,at ,body))
126 (#t `(list ,at
127 (cond
128 (((@@ (ice-9 peg) single?) ,body) (car ,body))
129 (#t ,body))))))
130 ((eq? accum 'none)
131 `(list ,at '()))
132 (#t
133 (begin
134 (pretty-print `(cg-generic-ret-error ,accum ,name ,body-uneval ,at))
135 (pretty-print "Defaulting to accum of none.\n")
136 `(list ,at '())))))))
137 ;; The short name makes the formatting below much easier to read.
138 (define cggr cg-generic-ret)
139
140 ;; Generates code that matches a particular string.
141 ;; E.g.: (cg-string "abc" 'body)
142 (define (cg-string match accum)
143 (safe-bind
144 (str strlen at)
145 (let ((len (string-length match)))
146 (cggl str strlen at
147 `(if (string=? (substring ,str ,at (min (+ ,at ,len) ,strlen))
148 ,match)
149 ,(cggr accum 'cg-string match `(+ ,at ,len))
150 #f)))))
151
152 ;; Generates code for matching any character.
153 ;; E.g.: (cg-peg-any 'body)
154 (define (cg-peg-any accum)
155 (safe-bind
156 (str strlen at)
157 (cggl str strlen at
158 (cggr accum 'cg-peg-any `(substring ,str ,at (+ ,at 1)) `(+ ,at 1)))))
159
160 ;; Generates code for matching a range of characters between start and end.
161 ;; E.g.: (cg-range #\a #\z 'body)
162 (define (cg-range start end accum)
163 (safe-bind
164 (str strlen at c)
165 (cggl str strlen at
166 `(let ((,c (string-ref ,str ,at)))
167 (if (and
168 (char>=? ,c ,start)
169 (char<=? ,c ,end))
170 ,(cggr accum 'cg-range `(string ,c) `(+ ,at 1))
171 #f)))))
172
173 ;; Filters the accum argument to peg-sexp-compile for buildings like string
174 ;; literals (since we don't want to tag them with their name if we're doing an
175 ;; "all" accum).
176 (define (builtin-accum-filter accum)
177 (cond
178 ((eq? accum 'all) 'body)
179 ((eq? accum 'name) 'name)
180 ((eq? accum 'body) 'body)
181 ((eq? accum 'none) 'none)))
182 (define baf builtin-accum-filter)
183
184 ;; Takes a value, prints some debug output, and returns it.
185 (define (error-val val)
186 (begin
187 (pretty-print val)
188 (pretty-print "Inserting into code for debugging.\n")
189 val))
190
191 ;; Takes an arbitrary expressions and accumulation variable, then parses it.
192 ;; E.g.: (peg-sexp-compile '(and "abc" (or "-" (range #\a #\z))) 'all)
193 (define (peg-sexp-compile match accum)
194 (cond
195 ((string? match) (cg-string match (baf accum)))
196 ((symbol? match) ;; either peg-any or a nonterminal
197 (cond
198 ((eq? match 'peg-any) (cg-peg-any (baf accum)))
199 ;; if match is any other symbol it's a nonterminal, so just return it
200 (#t match)))
201 ((or (not (list? match)) (null? match))
202 ;; anything besides a string, symbol, or list is an error
203 (error-val `(peg-sexp-compile-error-1 ,match ,accum)))
204
205 ((eq? (car match) 'range) ;; range of characters (e.g. [a-z])
206 (cg-range (cadr match) (caddr match) (baf accum)))
207 ((eq? (car match) 'ignore) ;; match but don't parse
208 (peg-sexp-compile (cadr match) 'none))
209 ((eq? (car match) 'capture) ;; parse
210 (peg-sexp-compile (cadr match) 'body))
211 ((eq? (car match) 'peg) ;; embedded PEG string
212 (peg-string-compile (cadr match) (baf accum)))
213 ((eq? (car match) 'and) (cg-and (cdr match) (baf accum)))
214 ((eq? (car match) 'or) (cg-or (cdr match) (baf accum)))
215 ((eq? (car match) 'body)
216 (if (not (= (length match) 4))
217 (error-val `(peg-sexp-compile-error-2 ,match ,accum))
218 (apply cg-body (cons (baf accum) (cdr match)))))
219 (#t (error-val `(peg-sexp-compile-error-3 ,match ,accum)))))
220
221 ;;;;; Convenience macros for making sure things come out in a readable form.
222 ;; If SYM is a list of one element, return (car SYM), else return SYM.
223 (define-syntax single-filter
224 (lambda (x)
225 (syntax-case x ()
226 ((_ sym)
227 #'(if (single? sym) (car sym) sym)))))
228 ;; If OBJ is non-null, push it onto LST, otherwise do nothing.
229 (define-syntax push-not-null!
230 (lambda (x)
231 (syntax-case x ()
232 ((_ lst obj)
233 #'(if (not (null? obj)) (push! lst obj))))))
234
235 ;; Top-level function builder for AND. Reduces to a call to CG-AND-INT.
236 (define (cg-and arglst accum)
237 (safe-bind
238 (str strlen at body)
239 `(lambda (,str ,strlen ,at)
240 (let ((,body '()))
241 ,(cg-and-int arglst accum str strlen at body)))))
242
243 ;; Internal function builder for AND (calls itself).
244 (define (cg-and-int arglst accum str strlen at body)
245 (safe-bind
246 (res newat newbody)
247 (if (null? arglst)
248 (cggr accum 'cg-and `(reverse ,body) at) ;; base case
249 (let ((mf (peg-sexp-compile (car arglst) accum))) ;; match function
250 `(let ((,res (,mf ,str ,strlen ,at)))
251 (if (not ,res)
252 #f ;; if the match failed, the and failed
253 ;; otherwise update AT and BODY then recurse
254 (let ((,newat (car ,res))
255 (,newbody (cadr ,res)))
256 (set! ,at ,newat)
257 ((@@ (ice-9 peg) push-not-null!) ,body ((@@ (ice-9 peg) single-filter) ,newbody))
258 ,(cg-and-int (cdr arglst) accum str strlen at body))))))))
259
260 ;; Top-level function builder for OR. Reduces to a call to CG-OR-INT.
261 (define (cg-or arglst accum)
262 (safe-bind
263 (str strlen at body)
264 `(lambda (,str ,strlen ,at)
265 ,(cg-or-int arglst accum str strlen at body))))
266
267 ;; Internal function builder for OR (calls itself).
268 (define (cg-or-int arglst accum str strlen at body)
269 (safe-bind
270 (res)
271 (if (null? arglst)
272 #f ;; base case
273 (let ((mf (peg-sexp-compile (car arglst) accum)))
274 `(let ((,res (,mf ,str ,strlen ,at)))
275 (if ,res ;; if the match succeeds, we're done
276 ,(cggr accum 'cg-or `(cadr ,res) `(car ,res))
277 ,(cg-or-int (cdr arglst) accum str strlen at body)))))))
278
279 ;; Returns a block of code that tries to match MATCH, and on success updates AT
280 ;; and BODY, return #f on failure and #t on success.
281 (define (cg-body-test match accum str strlen at body)
282 (safe-bind
283 (at2-body2 at2 body2)
284 (let ((mf (peg-sexp-compile match accum)))
285 `(let ((,at2-body2 (,mf ,str ,strlen ,at)))
286 (if (or (not ,at2-body2) (= ,at (car ,at2-body2)))
287 #f
288 (let ((,at2 (car ,at2-body2))
289 (,body2 (cadr ,at2-body2)))
290 (set! ,at ,at2)
291 ((@@ (ice-9 peg) push-not-null!)
292 ,body
293 ((@@ (ice-9 peg) single-filter) ,body2))
294 #t))))))
295
296 ;; Returns a block of code that sees whether NUM wants us to try and match more
297 ;; given that we've already matched COUNT.
298 (define (cg-body-more num count)
299 (cond ((number? num) `(< ,count ,num))
300 ((eq? num '+) #t)
301 ((eq? num '*) #t)
302 ((eq? num '?) `(< ,count 1))
303 (#t (error-val `(cg-body-more-error ,num ,count)))))
304
305 ;; Returns a function that takes a paramter indicating whether or not the match
306 ;; was succesful and returns what the body expression should return.
307 (define (cg-body-ret accum type name body at at2)
308 (safe-bind
309 (success)
310 `(lambda (,success)
311 ,(cond ((eq? type '!) `(if ,success #f ,(cggr accum name ''() at)))
312 ((eq? type '&) `(if ,success ,(cggr accum name ''() at) #f))
313 ((eq? type 'lit)
314 `(if ,success ,(cggr accum name `(reverse ,body) at2) #f))
315 (#t (error-val
316 `(cg-body-ret-error ,type ,accum ,name ,body ,at ,at2)))))))
317
318 ;; Returns a block of code that sees whether COUNT satisfies the constraints of
319 ;; NUM.
320 (define (cg-body-success num count)
321 (cond ((number? num) `(= ,count ,num))
322 ((eq? num '+) `(>= ,count 1))
323 ((eq? num '*) #t)
324 ((eq? num '?) `(<= ,count 1))
325 (#t `(cg-body-success-error ,num))))
326
327 ;; Returns a function that parses a BODY element.
328 (define (cg-body accum type match num)
329 (safe-bind
330 (str strlen at at2 count body)
331 `(lambda (,str ,strlen ,at)
332 (let ((,at2 ,at) (,count 0) (,body '()))
333 (while (and ,(cg-body-test match accum str strlen at2 body)
334 (set! ,count (+ ,count 1))
335 ,(cg-body-more num count)))
336 (,(cg-body-ret accum type 'cg-body body at at2)
337 ,(cg-body-success num count))))))
338
339 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
340 ;;;;; FOR DEFINING AND USING NONTERMINALS
341 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
342
343 ;; The results of parsing using a nonterminal are cached. Think of it like a
344 ;; hash with no conflict resolution. Process for deciding on the cache size
345 ;; wasn't very scientific; just ran the benchmarks and stopped a little after
346 ;; the point of diminishing returns on my box.
347 (define *cache-size* 512)
348
349 ;; Defines a new nonterminal symbol accumulating with ACCUM.
350 (define-syntax define-nonterm
351 (lambda (x)
352 (syntax-case x ()
353 ((_ sym accum match)
354 (let ((matchf (peg-sexp-compile (syntax->datum #'match)
355 (syntax->datum #'accum)))
356 (symsym (syntax->datum #'sym))
357 (accumsym (syntax->datum #'accum))
358 (c (datum->syntax x (gensym))));; the cache
359 ;; CODE is the code to parse the string if the result isn't cached.
360 (let ((code
361 (safe-bind
362 (str strlen at res body)
363 `(lambda (,str ,strlen ,at)
364 (let ((,res (,matchf ,str ,strlen ,at)))
365 ;; Try to match the nonterminal.
366 (if ,res
367 ;; If we matched, do some post-processing to figure out
368 ;; what data to propagate upward.
369 (let ((,at (car ,res))
370 (,body (cadr ,res)))
371 ,(cond
372 ((eq? accumsym 'name)
373 `(list ,at ',symsym))
374 ((eq? accumsym 'all)
375 `(list (car ,res)
376 (cond
377 ((not (list? ,body))
378 (list ',symsym ,body))
379 ((null? ,body) ',symsym)
380 ((symbol? (car ,body))
381 (list ',symsym ,body))
382 (#t (cons ',symsym ,body)))))
383 ((eq? accumsym 'none) `(list (car ,res) '()))
384 (#t (begin res))))
385 ;; If we didn't match, just return false.
386 #f))))))
387 #`(begin
388 (define #,c (make-vector *cache-size* #f));; the cache
389 (define (sym str strlen at)
390 (let* ((vref (vector-ref #,c (modulo at *cache-size*))))
391 ;; Check to see whether the value is cached.
392 (if (and vref (eq? (car vref) str) (= (cadr vref) at))
393 (caddr vref);; If it is return it.
394 (let ((fres ;; Else calculate it and cache it.
395 (#,(datum->syntax x code) str strlen at)))
396 (vector-set! #,c (modulo at *cache-size*)
397 (list str at fres))
398 fres))))
399
400 ;; Store the code in case people want to debug.
401 (set-symbol-property!
402 'sym 'code #,(datum->syntax x (list 'quote code)))
403 sym)))))))
404
405 ;; Gets the code corresponding to NONTERM
406 (define-syntax get-code
407 (lambda (x)
408 (syntax-case x ()
409 ((_ nonterm)
410 #`(pretty-print (symbol-property 'nonterm 'code))))))
411
412 ;; Parses STRING using NONTERM
413 (define (peg-parse nonterm string)
414 ;; We copy the string before using it because it might have been modified
415 ;; in-place since the last time it was parsed, which would invalidate the
416 ;; cache. Guile uses copy-on-write for strings, so this is fast.
417 (let ((res (nonterm (string-copy string) (string-length string) 0)))
418 (if (not res)
419 #f
420 (make-prec 0 (car res) string (string-collapse (cadr res))))))
421
422 ;; Searches through STRING for something that parses to PEG-MATCHER. Think
423 ;; regexp search.
424 (define-syntax peg-match
425 (lambda (x)
426 (syntax-case x ()
427 ((_ peg-matcher string-uncopied)
428 (let ((pmsym (syntax->datum #'peg-matcher)))
429 (let ((peg-sexp-compile
430 (if (string? pmsym)
431 (peg-string-compile pmsym 'body)
432 (peg-sexp-compile pmsym 'body))))
433 ;; We copy the string before using it because it might have been
434 ;; modified in-place since the last time it was parsed, which would
435 ;; invalidate the cache. Guile uses copy-on-write for strings, so
436 ;; this is fast.
437 #`(let ((string (string-copy string-uncopied))
438 (strlen (string-length string-uncopied))
439 (at 0))
440 (let ((ret ((@@ (ice-9 peg) until-works)
441 (or (>= at strlen)
442 (#,(datum->syntax x peg-sexp-compile)
443 string strlen at))
444 (set! at (+ at 1)))))
445 (if (eq? ret #t) ;; (>= at strlen) succeeded
446 #f
447 (let ((end (car ret))
448 (match (cadr ret)))
449 (make-prec
450 at end string
451 (string-collapse match))))))))))))
452
453 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
454 ;;;;; POST-PROCESSING FUNCTIONS (TO CANONICALIZE MATCH TREES)
455 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
456
457 ;; Is everything in LST true?
458 (define (andlst lst)
459 (or (null? lst)
460 (and (car lst) (andlst (cdr lst)))))
461
462 ;; Is LST a list of strings?
463 (define (string-list? lst)
464 (and (list? lst) (not (null? lst))
465 (andlst (map string? lst))))
466
467 ;; Groups all strings that are next to each other in LST. Used in
468 ;; STRING-COLLAPSE.
469 (define (string-group lst)
470 (if (not (list? lst))
471 lst
472 (if (null? lst)
473 '()
474 (let ((next (string-group (cdr lst))))
475 (if (not (string? (car lst)))
476 (cons (car lst) next)
477 (if (and (not (null? next))
478 (list? (car next))
479 (string? (caar next)))
480 (cons (cons (car lst) (car next)) (cdr next))
481 (cons (list (car lst)) next)))))))
482
483
484 ;; Collapses all the string in LST.
485 ;; ("a" "b" (c d) "e" "f") -> ("ab" (c d) "ef")
486 (define (string-collapse lst)
487 (if (list? lst)
488 (let ((res (map (lambda (x) (if (string-list? x)
489 (apply string-append x)
490 x))
491 (string-group (map string-collapse lst)))))
492 (if (single? res) (car res) res))
493 lst))
494
495 ;; If LST is an atom, return (list LST), else return LST.
496 (define (mklst lst)
497 (if (not (list? lst)) (list lst) lst))
498
499 ;; Takes a list and "flattens" it, using the predicate TST to know when to stop
500 ;; instead of terminating on atoms (see tutorial).
501 (define (context-flatten tst lst)
502 (if (or (not (list? lst)) (null? lst))
503 lst
504 (if (tst lst)
505 (list lst)
506 (apply append
507 (map (lambda (x) (mklst (context-flatten tst x)))
508 lst)))))
509
510 ;; Takes a list and "flattens" it, using the list of keywords KEYWORD-LST to
511 ;; know when to stop at (see tutorial).
512 (define (keyword-flatten keyword-lst lst)
513 (context-flatten
514 (lambda (x)
515 (if (or (not (list? x)) (null? x))
516 #t
517 (member (car x) keyword-lst)))
518 lst))
519
520 ;; Gets the left-hand depth of a list.
521 (define (depth lst)
522 (if (or (not (list? lst)) (null? lst))
523 0
524 (+ 1 (depth (car lst)))))
525
526 ;; Trims characters off the front and end of STR.
527 ;; (trim-1chars "'ab'") -> "ab"
528 (define (trim-1chars str) (substring str 1 (- (string-length str) 1)))
529
530 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
531 ;;;;; Parse string PEGs using sexp PEGs.
532 ;; See the variable PEG-AS-PEG for an easier-to-read syntax.
533 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
534
535 ;; Grammar for PEGs in PEG grammar.
536 (define peg-as-peg
537 "grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+
538 pattern <-- alternative (SLASH sp alternative)*
539 alternative <-- ([!&]? sp suffix)+
540 suffix <-- primary ([*+?] sp)*
541 primary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal !'<'
542 literal <-- ['] (!['] .)* ['] sp
543 charclass <-- LB (!']' (CCrange / CCsingle))* RB sp
544 CCrange <-- . '-' .
545 CCsingle <-- .
546 nonterminal <-- [a-zA-Z0-9-]+ sp
547 sp < [ \t\n]*
548 SLASH < '/'
549 LB < '['
550 RB < ']'
551 ")
552
553 (define-nonterm peg-grammar all
554 (body lit (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern) +))
555 (define-nonterm peg-pattern all
556 (and peg-alternative
557 (body lit (and (ignore "/") peg-sp peg-alternative) *)))
558 (define-nonterm peg-alternative all
559 (body lit (and (body lit (or "!" "&") ?) peg-sp peg-suffix) +))
560 (define-nonterm peg-suffix all
561 (and peg-primary (body lit (and (or "*" "+" "?") peg-sp) *)))
562 (define-nonterm peg-primary all
563 (or (and "(" peg-sp peg-pattern ")" peg-sp)
564 (and "." peg-sp)
565 peg-literal
566 peg-charclass
567 (and peg-nonterminal (body ! "<" 1))))
568 (define-nonterm peg-literal all
569 (and "'" (body lit (and (body ! "'" 1) peg-any) *) "'" peg-sp))
570 (define-nonterm peg-charclass all
571 (and (ignore "[")
572 (body lit (and (body ! "]" 1)
573 (or charclass-range charclass-single)) *)
574 (ignore "]")
575 peg-sp))
576 (define-nonterm charclass-range all (and peg-any "-" peg-any))
577 (define-nonterm charclass-single all peg-any)
578 (define-nonterm peg-nonterminal all
579 (and (body lit (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-") +) peg-sp))
580 (define-nonterm peg-sp none
581 (body lit (or " " "\t" "\n") *))
582
583 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
584 ;;;;; PARSE STRING PEGS
585 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
586
587 ;; Pakes a string representing a PEG grammar and defines all the nonterminals in
588 ;; it as the associated PEGs.
589 (define (peg-parser str)
590 (let ((parsed (peg-parse peg-grammar str)))
591 (if (not parsed)
592 (begin
593 ;; (pretty-print "Invalid PEG grammar!\n")
594 #f)
595 (let ((lst (peg:tree parsed)))
596 (cond
597 ((or (not (list? lst)) (null? lst))
598 lst)
599 ((eq? (car lst) 'peg-grammar)
600 (cons 'begin (map (lambda (x) (peg-parse-nonterm x))
601 (context-flatten (lambda (lst) (<= (depth lst) 2))
602 (cdr lst))))))))))
603
604 ;; Macro wrapper for PEG-PARSER. Parses PEG grammars expressed as strings and
605 ;; defines all the appropriate nonterminals.
606 (define-syntax define-grammar
607 (lambda (x)
608 (syntax-case x ()
609 ((_ str)
610 (datum->syntax x (peg-parser (syntax->datum #'str)))))))
611 (define define-grammar-f peg-parser)
612
613 ;; Parse a nonterminal and pattern listed in LST.
614 (define (peg-parse-nonterm lst)
615 (let ((nonterm (car lst))
616 (grabber (cadr lst))
617 (pattern (caddr lst)))
618 `(define-nonterm ,(string->symbol (cadr nonterm))
619 ,(cond
620 ((string=? grabber "<--") 'all)
621 ((string=? grabber "<-") 'body)
622 (#t 'none))
623 ,(compressor (peg-parse-pattern pattern)))))
624
625 ;; Parse a pattern.
626 (define (peg-parse-pattern lst)
627 (cons 'or (map peg-parse-alternative
628 (context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
629 (cdr lst)))))
630
631 ;; Parse an alternative.
632 (define (peg-parse-alternative lst)
633 (cons 'and (map peg-parse-body
634 (context-flatten (lambda (x) (or (string? (car x))
635 (eq? (car x) 'peg-suffix)))
636 (cdr lst)))))
637
638 ;; Parse a body.
639 (define (peg-parse-body lst)
640 (let ((suffix '())
641 (front 'lit))
642 (cond
643 ((eq? (car lst) 'peg-suffix)
644 (set! suffix lst))
645 ((string? (car lst))
646 (begin (set! front (string->symbol (car lst)))
647 (set! suffix (cadr lst))))
648 (#t `(peg-parse-body-fail ,lst)))
649 `(body ,front ,@(peg-parse-suffix suffix))))
650
651 ;; Parse a suffix.
652 (define (peg-parse-suffix lst)
653 (list (peg-parse-primary (cadr lst))
654 (if (null? (cddr lst))
655 1
656 (string->symbol (caddr lst)))))
657
658 ;; Parse a primary.
659 (define (peg-parse-primary lst)
660 (let ((el (cadr lst)))
661 (cond
662 ((list? el)
663 (cond
664 ((eq? (car el) 'peg-literal)
665 (peg-parse-literal el))
666 ((eq? (car el) 'peg-charclass)
667 (peg-parse-charclass el))
668 ((eq? (car el) 'peg-nonterminal)
669 (string->symbol (cadr el)))))
670 ((string? el)
671 (cond
672 ((equal? el "(")
673 (peg-parse-pattern (caddr lst)))
674 ((equal? el ".")
675 'peg-any)
676 (#t `(peg-parse-any unknown-string ,lst))))
677 (#t `(peg-parse-any unknown-el ,lst)))))
678
679 ;; Parses a literal.
680 (define (peg-parse-literal lst) (trim-1chars (cadr lst)))
681
682 ;; Parses a charclass.
683 (define (peg-parse-charclass lst)
684 (cons 'or
685 (map
686 (lambda (cc)
687 (cond
688 ((eq? (car cc) 'charclass-range)
689 `(range ,(string-ref (cadr cc) 0) ,(string-ref (cadr cc) 2)))
690 ((eq? (car cc) 'charclass-single)
691 (cadr cc))))
692 (context-flatten
693 (lambda (x) (or (eq? (car x) 'charclass-range)
694 (eq? (car x) 'charclass-single)))
695 (cdr lst)))))
696
697 ;; Compresses a list to save the optimizer work.
698 ;; e.g. (or (and a)) -> a
699 (define (compressor lst)
700 (if (or (not (list? lst)) (null? lst))
701 lst
702 (cond
703 ((and (or (eq? (car lst) 'or) (eq? (car lst) 'and))
704 (null? (cddr lst)))
705 (compressor (cadr lst)))
706 ((and (eq? (car lst) 'body)
707 (eq? (cadr lst) 'lit)
708 (eq? (cadddr lst) 1))
709 (compressor (caddr lst)))
710 (#t (map compressor lst)))))
711
712 ;; Builds a lambda-expressions for the pattern STR using accum.
713 (define (peg-string-compile str accum)
714 (peg-sexp-compile
715 (compressor (peg-parse-pattern (peg:tree (peg-parse peg-pattern str))))
716 accum))
717
718 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
719 ;;;;; PMATCH STRUCTURE MUNGING
720 ;; Pretty self-explanatory.
721 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
722
723 (define prec
724 (make-record-type "peg" '(start end string tree)))
725 (define make-prec
726 (record-constructor prec '(start end string tree)))
727 (define (peg:start pm)
728 (if pm ((record-accessor prec 'start) pm) #f))
729 (define (peg:end pm)
730 (if pm ((record-accessor prec 'end) pm) #f))
731 (define (peg:string pm)
732 (if pm ((record-accessor prec 'string) pm) #f))
733 (define (peg:tree pm)
734 (if pm ((record-accessor prec 'tree) pm) #f))
735 (define (peg:substring pm)
736 (if pm (substring (peg:string pm) (peg:start pm) (peg:end pm)) #f))
737 (define peg-record? (record-predicate prec))
738
739 )
740