Commit | Line | Data |
---|---|---|
2ee368dc NF |
1 | %%% |
2 | %%% Reader | |
3 | %%% | |
4 | ||
5 | -module(reader). | |
6 | ||
7 | -export([read_str/1]). | |
8 | ||
%% Parser state threaded through all of the read_* functions.
-record(reader, {
    tokens = [],  % input tokens not yet consumed
    tree          % subtree produced by the most recent read_* call
}).
13 | ||
%% Parse an input string into a syntax tree.  Empty input (nothing but
%% whitespace/comments) yields {ok, none}; otherwise the first form is
%% parsed and its tree returned as {ok, Tree}.  Tokenizer or parser
%% failures come back as {error, Reason}.
-spec read_str(string()) -> {ok, term()} | {error, term()}.
read_str(Input) ->
    case tokenize(Input) of
        {error, _} = TokenizeErr ->
            TokenizeErr;
        {ok, []} ->
            {ok, none};
        {ok, Tokens} ->
            case read_form(#reader{tokens = Tokens}) of
                % pull the finished tree out of the reader state
                {ok, #reader{tree = Tree}} ->
                    {ok, Tree};
                {error, _} = ParseErr ->
                    ParseErr
            end
    end.
26 | ||
%% Parse a single form from the reader's token stream.  Dispatches on
%% the next (peeked) token: sequence openers recurse via
%% read_list/read_vector/read_map, reader-macro tokens expand via
%% read_quoted/read_meta, stray closers are errors, and anything else
%% is a scalar handled by read_atom.  Returns {ok, Reader} with the
%% parsed subtree in #reader.tree, or {error, Reason}.
-spec read_form(#reader{}) -> {ok, #reader{}} | {error, term()}.
read_form(Reader) ->
    Token = peek(Reader),
    case Token of
        % peek/1 yields [] when no tokens remain; without this case a
        % dangling reader macro such as a lone "'" would crash inside
        % read_atom/next instead of reporting an error.
        [] -> {error, "unexpected EOF"};
        % fixed typo: these messages previously read "unexected"
        close_list -> {error, "unexpected ')'"};
        close_vector -> {error, "unexpected ']'"};
        close_map -> {error, "unexpected '}'"};
        open_list -> read_list(Reader);
        open_vector -> read_vector(Reader);
        open_map -> read_map(Reader);
        quote -> read_quoted(Reader, Token);
        quasiquote -> read_quoted(Reader, Token);
        unquote -> read_quoted(Reader, Token);
        'splice-unquote' -> read_quoted(Reader, Token);
        deref -> read_quoted(Reader, Token);
        'with-meta' -> read_meta(Reader);
        _ -> read_atom(Reader)
    end.
45 | ||
%% Parse a parenthesized list form; the resulting tree is tagged 'list'.
read_list(Reader) ->
    read_seq(Reader, $), open_list, close_list, list).
48 | ||
%% Parse a bracketed vector form; the resulting tree is tagged 'vector'.
%% Erlang has no array/vector type, so the elements are kept in a list.
read_vector(Reader) ->
    read_seq(Reader, $], open_vector, close_vector, vector).
52 | ||
%% Parse a hash-map form.  read_seq yields a flat list of alternating
%% keys and values, which list_to_map folds into an Erlang map.
%% Returns {ok, Reader} with tree {map, Map}, or {error, Reason} —
%% including an odd number of keys/values, which list_to_map reports as
%% {error, _} and which must be propagated rather than embedded in the
%% tree as if it were the map itself.
read_map(Reader) ->
    case read_seq(Reader, $}, open_map, close_map, map) of
        {ok, Reader1} ->
            {map, Pairs} = Reader1#reader.tree,
            case list_to_map(Pairs) of
                % a real map never matches {error, _}, so this cleanly
                % separates the failure case from a successful build
                {error, Reason} ->
                    {error, Reason};
                Map ->
                    {ok, #reader{tokens = Reader1#reader.tokens,
                                 tree = {map, Map}}}
            end;
        {error, Reason} ->
            {error, Reason}
    end.
62 | ||
%% Read a delimited sequence of forms: consume OpenDelim, collect forms
%% until CloseDelim, and tag the collected subtrees with Type.
%% CloseChar is only used to render the EOF error message inside
%% read_seq_tail.  Returns {ok, Reader} or {error, Reason}.
read_seq(Reader, CloseChar, OpenDelim, CloseDelim, Type) ->
    case next(Reader) of
        {OpenDelim, AfterOpen} ->
            case read_seq_tail(AfterOpen, CloseChar, CloseDelim, []) of
                {ok, Done} ->
                    % prepend the sequence's type tag to the result
                    Tagged = {Type, Done#reader.tree},
                    {ok, #reader{tokens = Done#reader.tokens, tree = Tagged}};
                {error, _} = Err ->
                    Err
            end;
        {Unexpected, _} ->
            {error, io_lib:format("error in read_seq, expected ~p but got ~p",
                                  [OpenDelim, Unexpected])}
    end.
78 | ||
%% Collect forms until CloseDelim appears.  Acc holds the parsed
%% subtrees in reverse order; they are un-reversed when the closing
%% delimiter is consumed.  Running out of tokens first is an error.
read_seq_tail(Reader, CloseChar, CloseDelim, Acc) ->
    case peek(Reader) of
        [] ->
            {error, io_lib:format("expected '~c', got EOF", [CloseChar])};
        CloseDelim ->
            % consume the closing delimiter and finalize the sequence
            {_, AfterClose} = next(Reader),
            {ok, #reader{tokens = AfterClose#reader.tokens,
                         tree = lists:reverse(Acc)}};
        _ ->
            case read_form(Reader) of
                {ok, Parsed} ->
                    read_seq_tail(Parsed, CloseChar, CloseDelim,
                                  [Parsed#reader.tree | Acc]);
                {error, _} = Err ->
                    Err
            end
    end.
94 | ||
% Convert a flat list of alternating keys and values into a map.  The
% elements are not tuples: keys are the even-positioned members and
% values the odd-positioned ones.  A list with an odd number of
% members yields {error, _}.
list_to_map(Elements) ->
    list_to_map(Elements, #{}).

list_to_map([], Acc) ->
    Acc;
list_to_map([Key, Value | Rest], Acc) ->
    list_to_map(Rest, maps:put(Key, Value, Acc));
list_to_map([_Dangling], _Acc) ->
    % a single leftover element means a key without a value
    {error, "odd number of hash-map keys/values"}.
107 | ||
% Expand reader-macro sugar into its normalized list form, e.g.
% `X becomes (quasiquote X).  Token is the macro's atom name, which
% becomes a string-named symbol at the head of the expansion.
read_quoted(Reader, Token) ->
    % drop the macro token itself before reading the quoted form
    {_, AfterMacro} = next(Reader),
    case read_form(AfterMacro) of
        {ok, Inner} ->
            Expanded = {list, [{symbol, atom_to_list(Token)},
                               Inner#reader.tree]},
            {ok, #reader{tokens = Inner#reader.tokens, tree = Expanded}};
        {error, _} = Err ->
            Err
    end.
118 | ||
%% Expand ^Meta Form into (with-meta Form Meta): the metadata form is
%% read first, then the target form, and the two are swapped in the
%% resulting list.  Returns {ok, Reader} or {error, Reason}.
read_meta(Reader) ->
    % discard the meta token
    {_T, Reader1} = next(Reader),
    case read_form(Reader1) of
        {ok, Reader2} ->
            M = Reader2#reader.tree,
            case read_form(Reader2) of
                {ok, Reader3} ->
                    X = Reader3#reader.tree,
                    % use a string symbol name for consistency with
                    % read_quoted and read_atom, which both represent
                    % symbols as {symbol, string()} (was the atom
                    % 'with-meta', which no other tree node used)
                    Result = {list, [{symbol, "with-meta"}, X, M]},
                    {ok, #reader{tokens = Reader3#reader.tokens, tree = Result}};
                {error, Reason} -> {error, Reason}
            end;
        {error, Reason} -> {error, Reason}
    end.
134 | ||
%% Convert a single scalar token into its tree node.  Integer tokens
%% carry their digits as a string and are converted to a real integer
%% here; the symbols true/false/nil become the corresponding bare
%% atoms, and everything else keeps its lexer tag.
read_atom(Reader) ->
    {Token, Rest} = next(Reader),
    {ok, #reader{tokens = Rest#reader.tokens, tree = atom_tree(Token)}}.

% Map one scalar token to its tree representation.
atom_tree({integer, Digits}) -> {integer, list_to_integer(Digits)};
atom_tree({string, String}) -> {string, String};
atom_tree({keyword, Keyword}) -> {keyword, Keyword};
atom_tree({symbol, "true"}) -> true;
atom_tree({symbol, "false"}) -> false;
atom_tree({symbol, "nil"}) -> nil;
atom_tree({symbol, Symbol}) -> {symbol, Symbol}.
150 | ||
%% Return the next token without consuming it, or [] when none remain.
peek(#reader{tokens = []}) -> [];
peek(#reader{tokens = [Next | _]}) -> Next.
156 | ||
%% Consume the next token, returning it with the advanced reader state.
%% Crashes on an empty token list; callers peek/1 first when unsure.
next(Reader) ->
    [Next | Remaining] = Reader#reader.tokens,
    {Next, #reader{tokens = Remaining}}.
160 | ||
%% Split the input string into a flat list of tokens.
-spec tokenize(string()) -> {ok, [term()]} | {error, term()}.
tokenize(Input) ->
    tokenize(Input, []).

%% Tail-recursive worker: lexes one token at a time, accumulating in
%% reverse and restoring order at end of input.  Whitespace and
%% comments come back tagged 'ignored' and are dropped.
-spec tokenize(string(), [term()]) -> {ok, [term()]} | {error, term()}.
tokenize(Input, Acc) ->
    case lex_single(Input) of
        eof -> {ok, lists:reverse(Acc)};
        {error, _} = Err -> Err;
        {ignored, Remaining} -> tokenize(Remaining, Acc);
        {Token, Remaining} -> tokenize(Remaining, [Token | Acc])
    end.
173 | ||
%% Lex a single token from the head of the input.  Returns eof at end
%% of input, {Token, Rest} for a real token, {ignored, Rest} for
%% whitespace and comments, or {error, Reason}.
lex_single([]) ->
    eof;
lex_single([Char|Rest]) ->
    case Char of
        $( -> {open_list, Rest};
        $) -> {close_list, Rest};
        $[ -> {open_vector, Rest};
        $] -> {close_vector, Rest};
        ${ -> {open_map, Rest};
        $} -> {close_map, Rest};
        $" -> lex_string(Rest, []);
        $; -> lex_comment(Rest);
        $: -> lex_symbol(Rest, keyword);
        $' -> {quote, Rest};
        $` -> {quasiquote, Rest};
        $~ -> lex_unquote(Rest);
        $@ -> {deref, Rest};
        $^ -> {'with-meta', Rest};
        N when N >= $0, N =< $9 -> lex_number(Rest, [Char]);
        S when S == 32; S == $,; S == $\r; S == $\n; S == $\t -> lex_spaces(Rest);
        % a backslash as the very last character has nothing following
        % it; the hd(Rest) call below would crash on [], so report the
        % error explicitly instead
        $\\ when Rest =:= [] -> {error, "bare escape literal at end of input"};
        $\\ -> {error, io_lib:format("bare escape literal ~c~c", [Char, hd(Rest)])};
        $. -> {error, "bare dot (.) not supported"};
        _ -> lex_symbol([Char|Rest], symbol)
    end.
198 | ||
% Skip a comment: consume characters up to and including the line
% terminator, or to end of input.  A comment produces no token.
lex_comment([$\r | Rest]) -> {ignored, Rest};
lex_comment([$\n | Rest]) -> {ignored, Rest};
lex_comment([_ | Rest]) -> lex_comment(Rest);
lex_comment([]) -> {ignored, []}.
205 | ||
% Consume a run of whitespace; comma is treated as whitespace too.
% Produces no token.
lex_spaces([C | Rest]) when C =:= $\s; C =:= $,; C =:= $\r; C =:= $\n; C =:= $\t ->
    lex_spaces(Rest);
lex_spaces(Rest) ->
    {ignored, Rest}.
210 | ||
%% Lex a double-quoted string, unescaping while building it.  The
%% opening quote has already been consumed.  Returns
%% {{string, Chars}, Rest} on the closing quote, or {error, Reason}
%% when input ends first.  Note: the previous `case Escaped of [] ->`
%% branch was unreachable (a list element is always a character); a
%% lone trailing backslash is now caught by its own clause.
lex_string([], _String) ->
    {error, "expected '\"', got EOF"};
lex_string([$\\], _String) ->
    % backslash at end of input: the escape has no character to escape
    {error, "end of string reached in escape"};
lex_string([$\\, Escaped | Rest], String) ->
    % TODO: should probably only allow \" and \n
    lex_string(Rest, [Escaped | String]);
lex_string([$" | Rest], String) ->
    {{string, lists:reverse(String)}, Rest};
lex_string([C | Rest], String) ->
    lex_string(Rest, [C | String]).
224 | ||
% Accumulate the remaining digits of an integer literal.  Acc holds
% digits already seen, in reverse; the token keeps the digits as a
% string (conversion to an integer happens when the atom is read).
lex_number([D | Rest], Acc) when D >= $0, D =< $9 ->
    lex_number(Rest, [D | Acc]);
lex_number(Rest, Acc) ->
    {{integer, lists:reverse(Acc)}, Rest}.
229 | ||
% Lex the remainder of either a keyword or a symbol.  Type tags the
% returned tuple (the atom keyword or symbol).  A symbol consists of
% letters, digits, and the punctuation accepted by is_symbol/1; an
% empty match is an error.
lex_symbol(Input, Type) ->
    Valid = fun(C) -> is_letter(C) orelse is_digit(C) orelse is_symbol(C) end,
    % splitwith/2 gives both the matched prefix and the remainder in one
    % pass, replacing the takewhile + sublist pair
    case lists:splitwith(Valid, Input) of
        {[], _} -> {error, io_lib:format("invalid symbol: ~10s", [Input])};
        {Name, Rest} -> {{Type, Name}, Rest}
    end.

% True when C is an ASCII digit.
is_digit(C) ->
    C >= $0 andalso C =< $9.

% True when C is an ASCII letter of either case.
is_letter(C) ->
    C >= $a andalso C =< $z orelse C >= $A andalso C =< $Z.

% True when C is one of the punctuation characters allowed in symbols.
is_symbol(C) ->
    lists:member(C, "!#$%&*+-/:<=>?@^_|\~").
250 | ||
% Distinguish ~@ (splice-unquote) from plain ~ (unquote).  The leading
% tilde has already been consumed by lex_single/1.
lex_unquote([$@ | Rest]) -> {'splice-unquote', Rest};
lex_unquote(Other) -> {unquote, Other}.