7 Jane Street Holding, LLC
9 email: mmottl\@janestcapital.com
10 WWW: http://www.janestcapital.com/ocaml
12 This library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Lesser General Public
14 License as published by the Free Software Foundation; either
15 version 2 of the License, or (at your option) any later version.
17 This library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Lesser General Public License for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License along with this library; if not, write to the Free Software
24 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 (* Sexp: Module for handling S-expressions (I/O, etc.) *)
33 (* Default indentation level for human-readable conversions *)
35 let default_indent = ref 1
37 (* Escaping of strings used as atoms in S-expressions *)
39 let is_special_char c
=
40 c
<= ' '
|| c
= '
"' || c = '(' || c = ')' || c = ';' || c = '\\'
43 let len = String.length str in
45 let rec loop ix = is_special_char str.[ix] || ix > 0 && loop (ix - 1) in
48 let maybe_esc_str str =
49 if must_escape str then
50 let estr = String.escaped str in
51 let elen = String.length estr in
52 let res = String.create (elen + 2) in
53 String.blit estr 0 res 1 elen;
55 res.[elen + 1] <- '
"';
59 let pp_maybe_esc_str ppf str = pp_print_string ppf (maybe_esc_str str)
61 (* Output of S-expressions to formatters *)
63 let rec pp_hum_indent indent ppf = function
64 | Atom str -> pp_maybe_esc_str ppf str
66 pp_open_box ppf indent;
67 pp_print_string ppf "(";
68 pp_hum_indent indent ppf h;
69 pp_hum_rest indent ppf t
70 | List [] -> pp_print_string ppf "()"
72 and pp_hum_rest indent ppf = function
74 pp_print_space ppf ();
75 pp_hum_indent indent ppf h;
76 pp_hum_rest indent ppf t
78 pp_print_string ppf ")";
81 let rec pp_mach_internal may_need_space ppf = function
83 let str' = maybe_esc_str str in
84 let new_may_need_space = str' == str in
85 if may_need_space && new_may_need_space then pp_print_string ppf " ";
86 pp_print_string ppf str';
89 pp_print_string ppf "(";
90 let may_need_space = pp_mach_internal false ppf h in
91 pp_mach_rest may_need_space ppf t;
93 | List [] -> pp_print_string ppf "()"; false
95 and pp_mach_rest may_need_space ppf = function
97 let may_need_space = pp_mach_internal may_need_space ppf h in
98 pp_mach_rest may_need_space ppf t
99 | [] -> pp_print_string ppf ")"
101 let pp_hum ppf sexp = pp_hum_indent !default_indent ppf sexp
103 let pp_mach ppf sexp = ignore (pp_mach_internal false ppf sexp)
108 let rec size_loop (v, c as acc) = function
109 | Atom str -> v + 1, c + String.length str
110 | List lst -> List.fold_left size_loop acc lst
112 let size sexp = size_loop (0, 0) sexp
115 (* Buffer conversions *)
117 let to_buffer_hum ~buf ?(indent = !default_indent) sexp =
118 Format.bprintf buf "%a
@?
" (pp_hum_indent indent) sexp
120 let to_buffer_mach ~buf sexp =
121 let rec loop may_need_space = function
123 let str' = maybe_esc_str str in
124 let new_may_need_space = str' == str in
125 if may_need_space && new_may_need_space then Buffer.add_char buf ' ';
126 Buffer.add_string buf str';
129 Buffer.add_char buf '(';
130 let may_need_space = loop false h in
131 loop_rest may_need_space t;
133 | List [] -> Buffer.add_string buf "()"; false
134 and loop_rest may_need_space = function
136 let may_need_space = loop may_need_space h in
137 loop_rest may_need_space t
138 | [] -> Buffer.add_char buf ')' in
139 ignore (loop false sexp)
141 let to_buffer = to_buffer_mach
144 (* Output of S-expressions to I/O-channels *)
146 let buffer () = Buffer.create 4096
148 let with_new_buffer oc f =
149 let buf = buffer () in
151 Buffer.output_buffer oc buf
153 let output_hum oc sexp =
154 with_new_buffer oc (fun buf -> to_buffer_hum sexp ~buf)
156 let output_hum_indent indent oc sexp =
157 with_new_buffer oc (fun buf -> to_buffer_hum ~indent sexp ~buf)
159 let output_mach oc sexp =
160 with_new_buffer oc (fun buf -> to_buffer_mach sexp ~buf)
162 let output = output_mach
165 (* String conversions *)
167 let to_string_hum ?indent sexp =
168 let buf = buffer () in
169 to_buffer_hum ?indent sexp ~buf;
172 let to_string_mach sexp =
173 let buf = buffer () in
174 to_buffer_mach sexp ~buf;
177 let to_string = to_string_mach
182 let scan_sexp ?buf lexbuf = Parser.sexp (Lexer.main ?buf) lexbuf
183 let scan_sexps ?buf lexbuf = Parser.sexps (Lexer.main ?buf) lexbuf
185 let get_main_buf buf =
188 | None -> Buffer.create 64
192 let scan_fold_sexps ?buf ~f ~init lexbuf =
193 let main = get_main_buf buf in
195 match Parser.sexp_opt main lexbuf with
197 | Some sexp -> loop (f sexp acc) in
200 let scan_iter_sexps ?buf ~f lexbuf =
201 let main = get_main_buf buf in
203 match Parser.sexp_opt main lexbuf with
205 | Some sexp -> f sexp; loop () in
208 let scan_cnv_sexps ?buf ~f lexbuf =
209 let coll sexp acc = f sexp :: acc in
210 List.rev (scan_fold_sexps ?buf ~f:coll ~init:[] lexbuf)
213 (* Partial parsing *)
217 mutable text_line : int; (** Line position in parsed text *)
218 mutable text_char : int; (** Character position in parsed text *)
219 mutable buf_pos : int; (** Reading position in buffer *)
222 type 'a parse_result = Done of t * parse_pos | Cont of bool * 'a parse_fun
223 and 'a parse_fun = pos : int -> len : int -> 'a -> 'a parse_result
227 parse_pos : parse_pos;
228 mutable pstack : t list list;
236 parse_state : parse_state;
239 exception ParseError of parse_error
241 let bump_text_line { parse_pos = parse_pos } =
242 parse_pos.text_line <- parse_pos.text_line + 1;
243 parse_pos.text_char <- 1
245 let bump_text_pos { parse_pos = parse_pos } =
246 parse_pos.text_char <- parse_pos.text_char + 1
248 let bump_pos_cont state str ~max_pos ~pos cont =
250 cont state str ~max_pos ~pos:(pos + 1)
252 let bump_line_cont state str ~max_pos ~pos cont =
253 bump_text_line state;
254 cont state str ~max_pos ~pos:(pos + 1)
256 let add_bump bump state str ~max_pos ~pos c cont =
257 Buffer.add_char state.pbuf c;
259 cont state str ~max_pos ~pos:(pos + 1)
261 let add_bump_pos state str ~max_pos ~pos c cont =
262 add_bump bump_text_pos state str ~max_pos ~pos c cont
264 let add_bump_line state str ~max_pos ~pos c cont =
265 add_bump bump_text_line state str ~max_pos ~pos c cont
267 let mk_parse_pos { parse_pos = parse_pos } buf_pos =
268 parse_pos.buf_pos <- buf_pos;
271 let bump_found_atom bump state str ~max_pos ~pos cont =
272 let pbuf = state.pbuf in
273 let atom = Atom (Buffer.contents pbuf) in
274 match state.pstack with
275 | [] -> Done (atom, mk_parse_pos state pos)
276 | rev_sexp_lst :: sexp_stack ->
278 state.pstack <- (atom :: rev_sexp_lst) :: sexp_stack;
280 cont state str ~max_pos ~pos:(pos + 1)
282 let raise_parse_error state location err_msg =
290 raise (ParseError parse_error)
292 let raise_unexpected_char state ~loc pos c =
293 let err_msg = sprintf "unexpected character
: '%c'
" c in
294 let parse_pos = state.parse_pos in
295 parse_pos.buf_pos <- pos;
296 parse_pos.text_char <- parse_pos.text_char + 1;
297 raise_parse_error state loc err_msg
299 (* Macro for generating parsers *)
300 #define MK_PARSER(TYPE, GET_LEN, PARSE, GET_CHAR) \
301 let check_str_bounds loc ~pos ~len (str : TYPE) = \
302 if pos < 0 then invalid_arg (loc ^ ": pos
< 0"); \
303 if len < 0 then invalid_arg (loc ^ ": len < 0"); \
304 let str_len = GET_LEN str in \
305 let pos_len = pos + len in \
306 if pos_len > str_len then invalid_arg (loc ^ ": pos
+ len > str_len"); \
309 let mk_cont name cont state = \
310 let ws_only = state.pstack = [] && Buffer.length state.pbuf = 0 in \
311 let parse_fun ~pos ~len str = \
312 let max_pos = check_str_bounds name ~pos ~len str in \
313 cont state str ~max_pos ~pos \
315 Cont (ws_only, parse_fun) \
317 let rec PARSE state str ~max_pos ~pos = \
318 if pos > max_pos then mk_cont "parse
" PARSE state \
320 match GET_CHAR with \
322 state.pstack <- [] :: state.pstack; \
323 bump_pos_cont state str ~max_pos ~pos PARSE \
325 (match state.pstack with \
326 | [] -> raise_unexpected_char state ~loc:"parse
" pos c \
327 | rev_sexp_lst :: sexp_stack -> \
328 let sexp = List (List.rev rev_sexp_lst) in \
329 match sexp_stack with \
330 | [] -> Done (sexp, mk_parse_pos state (pos + 1)) \
331 | higher_rev_sexp_lst :: higher_sexp_stack -> \
333 (sexp :: higher_rev_sexp_lst) :: higher_sexp_stack; \
334 bump_pos_cont state str ~max_pos ~pos PARSE) \
335 | ' ' | '\009' | '\012' -> bump_pos_cont state str ~max_pos ~pos PARSE \
336 | '\010' -> bump_line_cont state str ~max_pos ~pos PARSE \
337 | '\013' -> bump_line_cont state str ~max_pos ~pos parse_nl \
338 | ';' -> bump_pos_cont state str ~max_pos ~pos parse_comment \
339 | '"'
-> bump_pos_cont state
str ~
max_pos ~pos parse_quoted \
340 | c
-> add_bump_pos state
str ~
max_pos ~pos c parse_atom \
342 and parse_nl state
str ~
max_pos ~pos
= \
343 if pos
> max_pos then mk_cont "parse_nl" parse_nl state \
345 let pos = if GET_CHAR
= '
\010'
then pos + 1 else pos in \
346 PARSE state
str ~
max_pos ~
pos \
348 and parse_comment state
str ~
max_pos ~
pos = \
349 if pos > max_pos then mk_cont "parse_comment" parse_comment state \
351 match GET_CHAR
with \
352 | '
\010'
-> bump_line_cont state
str ~
max_pos ~
pos PARSE \
353 | '
\013'
-> bump_line_cont state
str ~
max_pos ~
pos parse_nl \
354 | _
-> bump_pos_cont state
str ~
max_pos ~
pos parse_comment \
356 and parse_atom state
str ~
max_pos ~
pos = \
357 if pos > max_pos then mk_cont "parse_atom" parse_atom state \
359 match GET_CHAR
with \
360 | ' '
| '
\009'
| '
\012'
-> \
361 bump_found_atom bump_text_pos state
str ~
max_pos ~
pos PARSE \
363 let pbuf = state
.pbuf in \
364 let atom = Atom
(Buffer.contents
pbuf) in \
365 (match state
.pstack
with \
366 | [] -> Done
(atom, mk_parse_pos state
pos) \
367 | rev_sexp_lst
:: sexp_stack
-> \
369 state
.pstack
<- [] :: (atom :: rev_sexp_lst
) :: sexp_stack
; \
370 bump_pos_cont state
str ~
max_pos ~
pos PARSE) \
372 let pbuf = state
.pbuf in \
373 let atom = Atom
(Buffer.contents
pbuf) in \
374 (match state
.pstack
with \
375 | [] -> Done
(atom, mk_parse_pos state
pos) \
376 | rev_sexp_lst
:: sexp_stack
-> \
377 let sexp = List
(List.rev_append rev_sexp_lst
[atom]) in \
378 match sexp_stack
with \
379 | [] -> Done
(sexp, mk_parse_pos state
(pos + 1)) \
380 | higher_rev_sexp_lst
:: higher_sexp_stack
-> \
383 (sexp :: higher_rev_sexp_lst
) :: higher_sexp_stack
; \
384 bump_pos_cont state
str ~
max_pos ~
pos PARSE) \
385 | '
\010'
-> bump_found_atom bump_text_line state
str ~
max_pos ~
pos PARSE \
387 bump_found_atom bump_text_line state
str ~
max_pos ~
pos parse_nl \
389 bump_found_atom bump_text_pos state
str ~
max_pos ~
pos parse_comment \
391 bump_found_atom bump_text_pos state str ~max_pos ~pos parse_quoted \
392 | c -> add_bump_pos state str ~max_pos ~pos c parse_atom \
394 and parse_quoted state str ~max_pos ~pos = \
395 if pos > max_pos then mk_cont "parse_quoted
" parse_quoted state \
397 match GET_CHAR with \
399 let pbuf = state
.pbuf in \
400 let atom = Atom
(Buffer.contents
pbuf) in \
401 (match state
.pstack
with \
402 | [] -> Done
(atom, mk_parse_pos state
(pos + 1)) \
403 | rev_sexp_lst
:: sexp_stack
-> \
405 state
.pstack
<- (atom :: rev_sexp_lst
) :: sexp_stack
; \
406 bump_pos_cont state
str ~
max_pos ~
pos PARSE) \
407 | '
\\'
-> bump_pos_cont state
str ~
max_pos ~
pos parse_escaped \
408 | '
\010'
as c
-> add_bump_line state
str ~
max_pos ~
pos c parse_quoted \
409 | '
\013'
as c
-> add_bump_line state
str ~
max_pos ~
pos c parse_quoted_nl \
410 | c
-> add_bump_pos state
str ~
max_pos ~
pos c parse_quoted \
412 and parse_quoted_nl state
str ~
max_pos ~
pos = \
413 if pos > max_pos then mk_cont "parse_quoted_nl" parse_quoted_nl state \
417 if GET_CHAR
= c then ( \
418 Buffer.add_char state
.pbuf c; \
423 parse_quoted state
str ~
max_pos ~
pos \
425 and parse_escaped state
str ~
max_pos ~
pos = \
426 if pos > max_pos then mk_cont "parse_escaped" parse_escaped state \
428 match GET_CHAR
with \
429 | '
\010'
-> bump_line_cont state
str ~
max_pos ~
pos parse_skip_ws \
430 | '
\013'
-> bump_line_cont state
str ~
max_pos ~
pos parse_skip_ws_nl \
431 | '
0'
.. '
9'
as c -> \
432 bump_text_pos state
; \
433 let d = Char.code
c - 48 in \
434 parse_dec state
str ~
max_pos ~
pos:(pos + 1) ~count
:2 ~
d \
436 bump_text_pos state
; \
437 parse_hex state
str ~
max_pos ~
pos:(pos + 1) ~count
:2 ~
d:0 \
438 | ('
\\'
| '
"' | '\'' ) as c -> \
439 add_bump_pos state str ~max_pos ~pos c parse_quoted \
440 | 'n' -> add_bump_pos state str ~max_pos ~pos '\n' parse_quoted \
441 | 't' -> add_bump_pos state str ~max_pos ~pos '\t' parse_quoted \
442 | 'b' -> add_bump_pos state str ~max_pos ~pos '\b' parse_quoted \
443 | 'r' -> add_bump_pos state str ~max_pos ~pos '\r' parse_quoted \
445 Buffer.add_char state.pbuf '\\'; \
446 add_bump_pos state str ~max_pos ~pos c parse_quoted \
448 and parse_skip_ws state str ~max_pos ~pos = \
449 if pos > max_pos then mk_cont "parse_skip_ws
" parse_skip_ws state \
451 match GET_CHAR with \
452 | ' ' | '\009' -> bump_pos_cont state str ~max_pos ~pos parse_skip_ws \
453 | _ -> parse_quoted state str ~max_pos ~pos \
455 and parse_skip_ws_nl state str ~max_pos ~pos = \
456 if pos > max_pos then mk_cont "parse_skip_ws_nl
" parse_skip_ws_nl state \
458 let pos = if GET_CHAR = '\010' then pos + 1 else pos in \
459 parse_skip_ws state str ~max_pos ~pos \
461 and parse_dec state str ~max_pos ~pos ~count ~d = \
462 if pos > max_pos then mk_cont "parse_dec
" (parse_dec ~count ~d) state \
464 match GET_CHAR with \
465 | '0' .. '9' as c -> \
466 let d = 10 * d + Char.code c - 48 in \
469 let err_msg = sprintf "illegal decimal escape
: \\%d
" d in \
470 raise_parse_error state "parse_dec
" err_msg \
472 add_bump_pos state str ~max_pos ~pos (Char.chr d) parse_quoted \
474 bump_text_pos state; \
475 parse_dec state str ~max_pos ~pos:(pos + 1) ~count:(count - 1) ~d) \
476 | c -> raise_unexpected_char state ~loc:"parse_dec
" pos c \
478 and parse_hex state str ~max_pos ~pos ~count ~d = \
479 if pos > max_pos then mk_cont "parse_hex
" (parse_hex ~count ~d) state \
481 match GET_CHAR with \
482 | '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' as c -> \
484 if c >= 'a' then 87 \
485 else if c >= 'A' then 55 \
488 let d = 16 * d + Char.code c - corr in \
491 let err_msg = sprintf "illegal hexadecimal escape
: \\%x
" d in \
492 raise_parse_error state "parse_hex
" err_msg \
494 add_bump_pos state str ~max_pos ~pos (Char.chr d) parse_quoted \
496 bump_text_pos state; \
497 parse_hex state str ~max_pos ~pos:(pos + 1) ~count:(count - 1) ~d) \
498 | c -> raise_unexpected_char state ~loc:"parse_hex
" pos c \
500 let PARSE ?(text_line = 1) ?(text_char = 1) ?(pos = 0) ?len str = \
504 | None -> GET_LEN str - pos \
506 let max_pos = check_str_bounds "parse
" ~pos ~len str in \
511 text_line = text_line; \
512 text_char = text_char; \
516 pbuf = Buffer.create 128; \
519 PARSE state str ~max_pos ~pos
521 MK_PARSER(string, String.length, parse_str, str.[pos])
523 let parse = parse_str
525 let plain_parse ~pos ~len str = parse ~pos ~len str
528 (* Partial parsing from bigstrings *)
530 (* NOTE: this is really an awful duplication of the code for parsing
531 strings, but since OCaml does not inline higher-order functions known
532 at compile, other solutions would sacrifice a lot of efficiency. *)
536 type bstr = (char, int8_unsigned_elt, c_layout) Array1.t
538 MK_PARSER(bstr, Array1.dim, parse_bstr, str.{pos})
541 (* Input functions *)
543 let reraise_parse_error pe global_pos =
544 let ps = pe.parse_state in
545 let ppos = ps.parse_pos in
546 let new_ppos = { ppos with buf_pos = global_pos + ppos.buf_pos } in
547 let new_ps = { ps with parse_pos = new_ppos } in
548 let new_pe = { pe with parse_state = new_ps } in
549 raise (ParseError new_pe)
551 let input_sexp ?text_line ?text_char ?(buf_pos = 0) ic =
552 let buf = String.create 1 in
553 let rec loop this_parse =
554 let c = input_char ic in
557 try this_parse ~pos:0 ~len:1 buf
558 with ParseError pe -> reraise_parse_error pe buf_pos
561 | Done (sexp, _) -> sexp
562 | Cont (_, this_parse) -> loop this_parse
564 let this_parse ~pos ~len str = parse ?text_line ?text_char ~pos ~len str in
568 ?text_line ?text_char
569 ?(buf_pos = 0) ?(buf = String.create 8192) ic =
570 let rev_sexps_ref = ref [] in
571 let buf_len = String.length buf in
572 let is_incomplete_ref = ref false in
573 let buf_pos_ref = ref buf_pos in
574 let rec loop this_parse pos len =
577 try this_parse ~pos ~len buf
578 with ParseError pe -> reraise_parse_error pe !buf_pos_ref
581 | Done (sexp, new_pos) ->
582 rev_sexps_ref := sexp :: !rev_sexps_ref;
583 let n_parsed = new_pos.buf_pos - pos in
584 is_incomplete_ref := false;
585 let text_line = new_pos.text_line in
586 let text_char = new_pos.text_char in
587 let this_parse ~pos ~len str =
588 parse ~text_line ~text_char ~pos ~len str
590 if n_parsed = len then
591 let new_len = input ic buf 0 buf_len in
592 buf_pos_ref := !buf_pos_ref + new_pos.buf_pos;
593 loop this_parse 0 new_len
594 else loop this_parse new_pos.buf_pos (len - n_parsed)
595 | Cont (ws_only, this_parse) ->
596 is_incomplete_ref := not ws_only;
597 buf_pos_ref := !buf_pos_ref + len + pos;
598 loop this_parse 0 (input ic buf 0 buf_len)
599 else if !is_incomplete_ref then raise End_of_file
602 let this_parse ~pos ~len str = parse ?text_line ?text_char ~pos ~len str in
603 loop this_parse 0 (input ic buf 0 buf_len)
605 let input_sexps ?text_line ?text_char ?buf_pos ?buf ic =
606 let rev_sexps = input_rev_sexps ?text_line ?text_char ?buf_pos ?buf ic in
609 (* of_string and of_bstr *)
611 let of_string_bstr loc this_parse ws_buf get_len get_sub str =
612 match this_parse str with
613 | Done (_, { buf_pos = buf_pos }) when buf_pos <> get_len str ->
614 let prefix_len = min (get_len str - buf_pos) 20 in
615 let prefix = get_sub str buf_pos prefix_len in
617 sprintf "%s
: S
-expression followed by data at position %d
: %S
..."
621 | Done (sexp, _) -> sexp
622 | Cont (ws_only, this_parse) ->
623 if ws_only then failwith (loc ^ ": whitespace only
");
624 match this_parse ~pos:0 ~len:1 ws_buf with
625 | Done (sexp, _) -> sexp
626 | Cont _ -> failwith (loc ^ ": incomplete S
-expression
")
629 of_string_bstr "Sexp.of_string" parse " " String.length String.sub str
631 let get_bstr_sub_str bstr pos len =
632 let str = String.create len in
633 for i = 0 to len - 1 do str.[i] <- bstr.{pos + i} done;
636 let bstr_ws_buf = Array1.create char c_layout 1
637 let () = bstr_ws_buf.{0} <- ' '
640 of_string_bstr "Sexp.of_bstr"
641 parse_bstr bstr_ws_buf Array1.dim get_bstr_sub_str bstr
645 let load_sexp ?(buf = String.create 8192) file =
646 let buf_len = String.length buf in
647 let ic = open_in file in
648 let rec loop this_parse =
649 let len = input ic buf 0 buf_len in
650 if len = 0 then raise End_of_file
652 match this_parse ~pos:0 ~len buf with
653 | Done (sexp, _) -> sexp
654 | Cont (_, this_parse) -> loop this_parse
657 let sexp = loop plain_parse in
660 with exc -> close_in_noerr ic; raise exc
662 let load_rev_sexps ?buf file =
663 let ic = open_in file in
665 let sexps = input_rev_sexps ?buf ic in
668 with exc -> close_in_noerr ic; raise exc
670 let load_sexps ?buf file =
671 let rev_sexps = load_rev_sexps ?buf file in
675 (* Utilities for automated type conversions *)
679 external sexp_of_t : t -> t = "%identity
"
680 external t_of_sexp : t -> t = "%identity
"