1 ;;; EARLEY -- Earley's parser, written by Marc Feeley.
3 ; $Id: earley.sch,v 1.2 1999/07/12 18:05:19 lth Exp $
4 ; 990708 / lth -- changed 'main' to 'earley-benchmark'.
6 ; (make-parser grammar lexer) is used to create a parser from the grammar
7 ; description `grammar' and the lexer function `lexer'.
9 ; A grammar is a list of definitions. Each definition defines a non-terminal
10 ; by a set of rules. Thus a definition has the form: (nt rule1 rule2...).
11 ; A given non-terminal can only be defined once. The first non-terminal
12 ; defined is the grammar's goal. Each rule is a possibly empty list of
13 ; non-terminals. Thus a rule has the form: (nt1 nt2...). A non-terminal
14 ; can be any scheme value. Note that all grammar symbols are treated as
15 ; non-terminals. This is fine though because the lexer will be outputing
18 ; The lexer defines what a token is and the mapping between tokens and
19 ; the grammar's non-terminals. It is a function of one argument, the input,
20 ; that returns the list of tokens corresponding to the input. Each token is
21 ; represented by a list. The first element is some `user-defined' information
22 ; associated with the token and the rest represents the token's class(es) (as a
23 ; list of non-terminals that this token corresponds to).
25 ; The result of `make-parser' is a function that parses the single input it
26 ; is given into the grammar's goal. The result is a `parse' which can be
27 ; manipulated with the procedures: `parse->parsed?', `parse->trees'
28 ; and `parse->nb-trees' (see below).
30 ; Let's assume that we want a parser for the grammar
36 ; and that the input to the parser is a string of characters. Also, assume we
37 ; would like to map the characters `x', `y', `+' and `=' into the corresponding
38 ; non-terminals in the grammar. Such a parser could be created with
48 ; (list char ; user-info = the character itself
54 ; (else (fatal-error "lexer error")))))
55 ; (string->list str)))
58 ; An alternative definition (that does not check for lexical errors) is
66 ; (lambda (str) (map (lambda (char) (list char char)) (string->list str)))
69 ; To help with the rest of the discussion, here are a few definitions:
71 ; An input pointer (for an input of `n' tokens) is a value between 0 and `n'.
72 ; It indicates a point between two input tokens (0 = beginning, `n' = end).
73 ; For example, if `n' = 4, there are 5 input pointers:
75 ; input token1 token2 token3 token4
76 ; input pointers 0 1 2 3 4
78 ; A configuration indicates the extent to which a given rule is parsed (this
79 ; is the common `dot notation'). For simplicity, a configuration is
80 ; represented as an integer, with successive configurations in the same
81 ; rule associated with successive integers. It is assumed that the grammar
82 ; has been extended with rules to aid scanning. These rules are of the
83 ; form `nt ->', and there is one such rule for every non-terminal. Note
84 ; that these rules are special because they only apply when the corresponding
85 ; non-terminal is returned by the lexer.
87 ; A configuration set is a configuration grouped with the set of input pointers
88 ; representing where the head non-terminal of the configuration was predicted.
90 ; Here are the rules and configurations for the grammar given above:
99 ; 3 > special rules (for scanning)
117 ; Starters of the non-terminal `nt' are configurations that are leftmost
118 ; in a non-special rule for `nt'. Enders of the non-terminal `nt' are
119 ; configurations that are rightmost in any rule for `nt'. Predictors of the
120 ; non-terminal `nt' are configurations that are directly to the left of `nt'
123 ; For the grammar given above,
125 ; Starters of V = (17 20)
126 ; Enders of V = (5 19 20)
127 ; Predictors of V = (15 17)
129 (define (make-parser grammar lexer)
131 (define (non-terminals grammar) ; return vector of non-terminals in grammar
133 (define (add-nt nt nts)
134 (if (member nt nts) nts (cons nt nts))) ; use equal? for equality tests
136 (let def-loop ((defs grammar) (nts '()))
138 (let* ((def (car defs))
140 (let rule-loop ((rules (cdr def))
141 (nts (add-nt head nts)))
143 (let ((rule (car rules)))
144 (let loop ((l rule) (nts nts))
147 (loop (cdr l) (add-nt nt nts)))
148 (rule-loop (cdr rules) nts))))
149 (def-loop (cdr defs) nts))))
150 (list->vector (reverse nts))))) ; goal non-terminal must be at index 0
152 (define (ind nt nts) ; return index of non-terminal `nt' in `nts'
153 (let loop ((i (- (vector-length nts) 1)))
155 (if (equal? (vector-ref nts i) nt) i (loop (- i 1)))
158 (define (nb-configurations grammar) ; return nb of configurations in grammar
159 (let def-loop ((defs grammar) (nb-confs 0))
161 (let ((def (car defs)))
162 (let rule-loop ((rules (cdr def)) (nb-confs nb-confs))
164 (let ((rule (car rules)))
165 (let loop ((l rule) (nb-confs nb-confs))
167 (loop (cdr l) (+ nb-confs 1))
168 (rule-loop (cdr rules) (+ nb-confs 1)))))
169 (def-loop (cdr defs) nb-confs))))
172 ; First, associate a numeric identifier to every non-terminal in the
173 ; grammar (with the goal non-terminal associated with 0).
175 ; So, for the grammar given above we get:
177 ; s -> 0 x -> 1 = -> 4 e ->3 + -> 4 v -> 5 y -> 6
179 (let* ((nts (non-terminals grammar)) ; id map = list of non-terms
180 (nb-nts (vector-length nts)) ; the number of non-terms
181 (nb-confs (+ (nb-configurations grammar) nb-nts)) ; the nb of confs
182 (starters (make-vector nb-nts '())) ; starters for every non-term
183 (enders (make-vector nb-nts '())) ; enders for every non-term
184 (predictors (make-vector nb-nts '())) ; predictors for every non-term
185 (steps (make-vector nb-confs #f)) ; what to do in a given conf
186 (names (make-vector nb-confs #f))) ; name of rules
188 (define (setup-tables grammar nts starters enders predictors steps names)
190 (define (add-conf conf nt nts class)
191 (let ((i (ind nt nts)))
192 (vector-set! class i (cons conf (vector-ref class i)))))
194 (let ((nb-nts (vector-length nts)))
196 (let nt-loop ((i (- nb-nts 1)))
199 (vector-set! steps i (- i nb-nts))
200 (vector-set! names i (list (vector-ref nts i) 0))
201 (vector-set! enders i (list i))
204 (let def-loop ((defs grammar) (conf (vector-length nts)))
206 (let* ((def (car defs))
208 (let rule-loop ((rules (cdr def)) (conf conf) (rule-num 1))
210 (let ((rule (car rules)))
211 (vector-set! names conf (list head rule-num))
212 (add-conf conf head nts starters)
213 (let loop ((l rule) (conf conf))
216 (vector-set! steps conf (ind nt nts))
217 (add-conf conf nt nts predictors)
218 (loop (cdr l) (+ conf 1)))
220 (vector-set! steps conf (- (ind head nts) nb-nts))
221 (add-conf conf head nts enders)
222 (rule-loop (cdr rules) (+ conf 1) (+ rule-num 1))))))
223 (def-loop (cdr defs) conf))))))))
225 ; Now, for each non-terminal, compute the starters, enders and predictors and
226 ; the names and steps tables.
228 (setup-tables grammar nts starters enders predictors steps names)
230 ; Build the parser description
232 (let ((parser-descr (vector lexer
241 (define (ind nt nts) ; return index of non-terminal `nt' in `nts'
242 (let loop ((i (- (vector-length nts) 1)))
244 (if (equal? (vector-ref nts i) nt) i (loop (- i 1)))
247 (define (comp-tok tok nts) ; transform token to parsing format
248 (let loop ((l1 (cdr tok)) (l2 '()))
250 (let ((i (ind (car l1) nts)))
252 (loop (cdr l1) (cons i l2))
254 (cons (car tok) (reverse l2)))))
256 (define (input->tokens input lexer nts)
257 (list->vector (map (lambda (tok) (comp-tok tok nts)) (lexer input))))
259 (define (make-states nb-toks nb-confs)
260 (let ((states (make-vector (+ nb-toks 1) #f)))
261 (let loop ((i nb-toks))
263 (let ((v (make-vector (+ nb-confs 1) #f)))
265 (vector-set! states i v)
269 (define (conf-set-get state conf)
270 (vector-ref state (+ conf 1)))
272 (define (conf-set-get* state state-num conf)
273 (let ((conf-set (conf-set-get state conf)))
276 (let ((conf-set (make-vector (+ state-num 6) #f)))
277 (vector-set! conf-set 1 -3) ; old elems tail (points to head)
278 (vector-set! conf-set 2 -1) ; old elems head
279 (vector-set! conf-set 3 -1) ; new elems tail (points to head)
280 (vector-set! conf-set 4 -1) ; new elems head
281 (vector-set! state (+ conf 1) conf-set)
284 (define (conf-set-merge-new! conf-set)
285 (vector-set! conf-set
286 (+ (vector-ref conf-set 1) 5)
287 (vector-ref conf-set 4))
288 (vector-set! conf-set 1 (vector-ref conf-set 3))
289 (vector-set! conf-set 3 -1)
290 (vector-set! conf-set 4 -1))
292 (define (conf-set-head conf-set)
293 (vector-ref conf-set 2))
295 (define (conf-set-next conf-set i)
296 (vector-ref conf-set (+ i 5)))
298 (define (conf-set-member? state conf i)
299 (let ((conf-set (vector-ref state (+ conf 1))))
301 (conf-set-next conf-set i)
304 (define (conf-set-adjoin state conf-set conf i)
305 (let ((tail (vector-ref conf-set 3))) ; put new element at tail
306 (vector-set! conf-set (+ i 5) -1)
307 (vector-set! conf-set (+ tail 5) i)
308 (vector-set! conf-set 3 i)
311 (vector-set! conf-set 0 (vector-ref state 0))
312 (vector-set! state 0 conf)))))
314 (define (conf-set-adjoin* states state-num l i)
315 (let ((state (vector-ref states state-num)))
318 (let* ((conf (car l1))
319 (conf-set (conf-set-get* state state-num conf)))
320 (if (not (conf-set-next conf-set i))
322 (conf-set-adjoin state conf-set conf i)
324 (loop (cdr l1))))))))
326 (define (conf-set-adjoin** states states* state-num conf i)
327 (let ((state (vector-ref states state-num)))
328 (if (conf-set-member? state conf i)
329 (let* ((state* (vector-ref states* state-num))
330 (conf-set* (conf-set-get* state* state-num conf)))
331 (if (not (conf-set-next conf-set* i))
332 (conf-set-adjoin state* conf-set* conf i))
336 (define (conf-set-union state conf-set conf other-set)
337 (let loop ((i (conf-set-head other-set)))
339 (if (not (conf-set-next conf-set i))
341 (conf-set-adjoin state conf-set conf i)
342 (loop (conf-set-next other-set i)))
343 (loop (conf-set-next other-set i))))))
345 (define (forw states state-num starters enders predictors steps nts)
347 (define (predict state state-num conf-set conf nt starters enders)
349 ; add configurations which start the non-terminal `nt' to the
352 (let loop1 ((l (vector-ref starters nt)))
354 (let* ((starter (car l))
355 (starter-set (conf-set-get* state state-num starter)))
356 (if (not (conf-set-next starter-set state-num))
358 (conf-set-adjoin state starter-set starter state-num)
362 ; check for possible completion of the non-terminal `nt' to the
365 (let loop2 ((l (vector-ref enders nt)))
367 (let ((ender (car l)))
368 (if (conf-set-member? state ender state-num)
369 (let* ((next (+ conf 1))
370 (next-set (conf-set-get* state state-num next)))
371 (conf-set-union state next-set next conf-set)
375 (define (reduce states state state-num conf-set head preds)
377 ; a non-terminal is now completed so check for reductions that
378 ; are now possible at the configurations `preds'
380 (let loop1 ((l preds))
382 (let ((pred (car l)))
383 (let loop2 ((i head))
385 (let ((pred-set (conf-set-get (vector-ref states i) pred)))
387 (let* ((next (+ pred 1))
388 (next-set (conf-set-get* state state-num next)))
389 (conf-set-union state next-set next pred-set)))
390 (loop2 (conf-set-next conf-set i)))
391 (loop1 (cdr l))))))))
393 (let ((state (vector-ref states state-num))
394 (nb-nts (vector-length nts)))
396 (let ((conf (vector-ref state 0)))
398 (let* ((step (vector-ref steps conf))
399 (conf-set (vector-ref state (+ conf 1)))
400 (head (vector-ref conf-set 4)))
401 (vector-set! state 0 (vector-ref conf-set 0))
402 (conf-set-merge-new! conf-set)
404 (predict state state-num conf-set conf step starters enders)
405 (let ((preds (vector-ref predictors (+ step nb-nts))))
406 (reduce states state state-num conf-set head preds)))
409 (define (forward starters enders predictors steps nts toks)
410 (let* ((nb-toks (vector-length toks))
411 (nb-confs (vector-length steps))
412 (states (make-states nb-toks nb-confs))
413 (goal-starters (vector-ref starters 0)))
414 (conf-set-adjoin* states 0 goal-starters 0) ; predict goal
415 (forw states 0 starters enders predictors steps nts)
418 (let ((tok-nts (cdr (vector-ref toks i))))
419 (conf-set-adjoin* states (+ i 1) tok-nts i) ; scan token
420 (forw states (+ i 1) starters enders predictors steps nts)
424 (define (produce conf i j enders steps toks states states* nb-nts)
425 (let ((prev (- conf 1)))
426 (if (and (>= conf nb-nts) (>= (vector-ref steps prev) 0))
427 (let loop1 ((l (vector-ref enders (vector-ref steps prev))))
429 (let* ((ender (car l))
430 (ender-set (conf-set-get (vector-ref states j)
433 (let loop2 ((k (conf-set-head ender-set)))
437 (conf-set-adjoin** states states* k prev i)
438 (conf-set-adjoin** states states* j ender k))
439 (loop2 (conf-set-next ender-set k)))
441 (loop1 (cdr l)))))))))
443 (define (back states states* state-num enders steps nb-nts toks)
444 (let ((state* (vector-ref states* state-num)))
446 (let ((conf (vector-ref state* 0)))
448 (let* ((conf-set (vector-ref state* (+ conf 1)))
449 (head (vector-ref conf-set 4)))
450 (vector-set! state* 0 (vector-ref conf-set 0))
451 (conf-set-merge-new! conf-set)
452 (let loop2 ((i head))
455 (produce conf i state-num enders steps
456 toks states states* nb-nts)
457 (loop2 (conf-set-next conf-set i)))
460 (define (backward states enders steps nts toks)
461 (let* ((nb-toks (vector-length toks))
462 (nb-confs (vector-length steps))
463 (nb-nts (vector-length nts))
464 (states* (make-states nb-toks nb-confs))
465 (goal-enders (vector-ref enders 0)))
466 (let loop1 ((l goal-enders))
468 (let ((conf (car l)))
469 (conf-set-adjoin** states states* nb-toks conf 0)
471 (let loop2 ((i nb-toks))
474 (back states states* i enders steps nb-nts toks)
478 (define (parsed? nt i j nts enders states)
479 (let ((nt* (ind nt nts)))
481 (let ((nb-nts (vector-length nts)))
482 (let loop ((l (vector-ref enders nt*)))
484 (let ((conf (car l)))
485 (if (conf-set-member? (vector-ref states j) conf i)
491 (define (deriv-trees conf i j enders steps names toks states nb-nts)
492 (let ((name (vector-ref names conf)))
494 (if name ; `conf' is at the start of a rule (either special or not)
496 (list (list name (car (vector-ref toks i))))
499 (let ((prev (- conf 1)))
500 (let loop1 ((l1 (vector-ref enders (vector-ref steps prev)))
503 (let* ((ender (car l1))
504 (ender-set (conf-set-get (vector-ref states j)
507 (let loop2 ((k (conf-set-head ender-set)) (l2 l2))
510 (conf-set-member? (vector-ref states k)
513 (deriv-trees prev i k enders steps names
516 (deriv-trees ender k j enders steps names
517 toks states nb-nts)))
518 (let loop3 ((l3 ender-trees) (l2 l2))
520 (let ((ender-tree (list (car l3))))
521 (let loop4 ((l4 prev-trees) (l2 l2))
524 (cons (append (car l4)
527 (loop3 (cdr l3) l2))))
528 (loop2 (conf-set-next ender-set k) l2))))
529 (loop2 (conf-set-next ender-set k) l2))
530 (loop1 (cdr l1) l2)))
531 (loop1 (cdr l1) l2)))
534 (define (deriv-trees* nt i j nts enders steps names toks states)
535 (let ((nt* (ind nt nts)))
537 (let ((nb-nts (vector-length nts)))
538 (let loop ((l (vector-ref enders nt*)) (trees '()))
540 (let ((conf (car l)))
541 (if (conf-set-member? (vector-ref states j) conf i)
543 (append (deriv-trees conf i j enders steps names
546 (loop (cdr l) trees)))
550 (define (nb-deriv-trees conf i j enders steps toks states nb-nts)
551 (let ((prev (- conf 1)))
552 (if (or (< conf nb-nts) (< (vector-ref steps prev) 0))
554 (let loop1 ((l (vector-ref enders (vector-ref steps prev)))
557 (let* ((ender (car l))
558 (ender-set (conf-set-get (vector-ref states j)
561 (let loop2 ((k (conf-set-head ender-set)) (n n))
564 (conf-set-member? (vector-ref states k)
567 (nb-deriv-trees prev i k enders steps
570 (nb-deriv-trees ender k j enders steps
571 toks states nb-nts)))
572 (loop2 (conf-set-next ender-set k)
573 (+ n (* nb-prev-trees nb-ender-trees))))
574 (loop2 (conf-set-next ender-set k) n))
579 (define (nb-deriv-trees* nt i j nts enders steps toks states)
580 (let ((nt* (ind nt nts)))
582 (let ((nb-nts (vector-length nts)))
583 (let loop ((l (vector-ref enders nt*)) (nb-trees 0))
585 (let ((conf (car l)))
586 (if (conf-set-member? (vector-ref states j) conf i)
588 (+ (nb-deriv-trees conf i j enders steps
591 (loop (cdr l) nb-trees)))
595 (let* ((lexer (vector-ref parser-descr 0))
596 (nts (vector-ref parser-descr 1))
597 (starters (vector-ref parser-descr 2))
598 (enders (vector-ref parser-descr 3))
599 (predictors (vector-ref parser-descr 4))
600 (steps (vector-ref parser-descr 5))
601 (names (vector-ref parser-descr 6))
602 (toks (input->tokens input lexer nts)))
611 (backward (forward starters enders predictors steps nts toks)
612 enders steps nts toks)
615 nb-deriv-trees*))))))
617 (define (parse->parsed? parse nt i j)
618 (let* ((nts (vector-ref parse 0))
619 (enders (vector-ref parse 2))
620 (states (vector-ref parse 7))
621 (parsed? (vector-ref parse 8)))
622 (parsed? nt i j nts enders states)))
624 (define (parse->trees parse nt i j)
625 (let* ((nts (vector-ref parse 0))
626 (enders (vector-ref parse 2))
627 (steps (vector-ref parse 4))
628 (names (vector-ref parse 5))
629 (toks (vector-ref parse 6))
630 (states (vector-ref parse 7))
631 (deriv-trees* (vector-ref parse 9)))
632 (deriv-trees* nt i j nts enders steps names toks states)))
634 (define (parse->nb-trees parse nt i j)
635 (let* ((nts (vector-ref parse 0))
636 (enders (vector-ref parse 2))
637 (steps (vector-ref parse 4))
638 (toks (vector-ref parse 6))
639 (states (vector-ref parse 7))
640 (nb-deriv-trees* (vector-ref parse 10)))
641 (nb-deriv-trees* nt i j nts enders steps toks states)))
644 (let ((p (make-parser '( (s (a) (s s)) )
645 (lambda (l) (map (lambda (x) (list x x)) l)))))
646 (let ((x (p (vector->list (make-vector k 'a)))))
647 (length (parse->trees x 's 0 k)))))
649 (define (earley-benchmark . args)
650 (let ((k (if (null? args) 9 (car args))))