Merge pull request #256 from vvakame/impl-ts
[jackhill/mal.git] / cpp / Reader.cpp
1 #include "MAL.h"
2 #include "Types.h"
3
4 #include <regex>
5
// Shorthand for the regular-expression type used throughout the reader.
using Regex = std::regex;

// A token that consists solely of an optionally signed run of decimal digits.
static const Regex intRegex("^[-+]?\\d+$");
// A single closing delimiter: ')', ']' or '}'.
static const Regex closeRegex("[\\)\\]}]");

// Input that carries no token: runs of whitespace and commas, or a ';'
// comment (the '.' does not cross a line terminator, so it stops at the end
// of the line).
static const Regex whitespaceRegex("[\\s,]+|;.*");

// Token patterns, tried in array order by Tokeniser::nextToken().
static const Regex tokenRegexes[] = {
    // The two-character token "~@"; listed before the single-character
    // pattern so that '~' alone does not win.
    Regex("~@"),
    // One special character.
    Regex("[\\[\\]{}()'`~^@]"),
    // A complete double-quoted string, allowing backslash escapes inside.
    Regex("\"(?:\\\\.|[^\\\\\"])*\""),
    // Anything else: a run of characters that are not whitespace or special.
    Regex("[^\\s\\[\\]{}('\"`,;)]+"),
};
18
19 class Tokeniser
20 {
21 public:
22 Tokeniser(const String& input);
23
24 String peek() const {
25 ASSERT(!eof(), "Tokeniser reading past EOF in peek\n");
26 return m_token;
27 }
28
29 String next() {
30 ASSERT(!eof(), "Tokeniser reading past EOF in next\n");
31 String ret = peek();
32 nextToken();
33 return ret;
34 }
35
36 bool eof() const {
37 return m_iter == m_end;
38 }
39
40 private:
41 void skipWhitespace();
42 void nextToken();
43
44 bool matchRegex(const Regex& regex);
45
46 typedef String::const_iterator StringIter;
47
48 String m_token;
49 StringIter m_iter;
50 StringIter m_end;
51 };
52
53 Tokeniser::Tokeniser(const String& input)
54 : m_iter(input.begin())
55 , m_end(input.end())
56 {
57 nextToken();
58 }
59
60 bool Tokeniser::matchRegex(const Regex& regex)
61 {
62 if (eof()) {
63 return false;
64 }
65
66 std::smatch match;
67 auto flags = std::regex_constants::match_continuous;
68 if (!std::regex_search(m_iter, m_end, match, regex, flags)) {
69 return false;
70 }
71
72 ASSERT(match.size() == 1, "Should only have one submatch, not %lu\n",
73 match.size());
74 ASSERT(match.position(0) == 0, "Need to match first character\n");
75 ASSERT(match.length(0) > 0, "Need to match a non-empty string\n");
76
77 // Don't advance m_iter now, do it after we've consumed the token in
78 // next(). If we do it now, we hit eof() when there's still one token left.
79 m_token = match.str(0);
80
81 return true;
82 }
83
84 void Tokeniser::nextToken()
85 {
86 m_iter += m_token.size();
87
88 skipWhitespace();
89 if (eof()) {
90 return;
91 }
92
93 for (auto &it : tokenRegexes) {
94 if (matchRegex(it)) {
95 return;
96 }
97 }
98
99 String mismatch(m_iter, m_end);
100 if (mismatch[0] == '"') {
101 MAL_CHECK(false, "Expected \", got EOF");
102 }
103 else {
104 MAL_CHECK(false, "Unexpected \"%s\"", mismatch.c_str());
105 }
106 }
107
108 void Tokeniser::skipWhitespace()
109 {
110 while (matchRegex(whitespaceRegex)) {
111 m_iter += m_token.size();
112 }
113 }
114
115 static malValuePtr readAtom(Tokeniser& tokeniser);
116 static malValuePtr readForm(Tokeniser& tokeniser);
117 static void readList(Tokeniser& tokeniser, malValueVec* items,
118 const String& end);
119 static malValuePtr processMacro(Tokeniser& tokeniser, const String& symbol);
120
121 malValuePtr readStr(const String& input)
122 {
123 Tokeniser tokeniser(input);
124 if (tokeniser.eof()) {
125 throw malEmptyInputException();
126 }
127 return readForm(tokeniser);
128 }
129
130 static malValuePtr readForm(Tokeniser& tokeniser)
131 {
132 MAL_CHECK(!tokeniser.eof(), "Expected form, got EOF");
133 String token = tokeniser.peek();
134
135 MAL_CHECK(!std::regex_match(token, closeRegex),
136 "Unexpected \"%s\"", token.c_str());
137
138 if (token == "(") {
139 tokeniser.next();
140 std::unique_ptr<malValueVec> items(new malValueVec);
141 readList(tokeniser, items.get(), ")");
142 return mal::list(items.release());
143 }
144 if (token == "[") {
145 tokeniser.next();
146 std::unique_ptr<malValueVec> items(new malValueVec);
147 readList(tokeniser, items.get(), "]");
148 return mal::vector(items.release());
149 }
150 if (token == "{") {
151 tokeniser.next();
152 malValueVec items;
153 readList(tokeniser, &items, "}");
154 return mal::hash(items.begin(), items.end(), false);
155 }
156 return readAtom(tokeniser);
157 }
158
159 static malValuePtr readAtom(Tokeniser& tokeniser)
160 {
161 struct ReaderMacro {
162 const char* token;
163 const char* symbol;
164 };
165 ReaderMacro macroTable[] = {
166 { "@", "deref" },
167 { "`", "quasiquote" },
168 { "'", "quote" },
169 { "~@", "splice-unquote" },
170 { "~", "unquote" },
171 };
172
173 struct Constant {
174 const char* token;
175 malValuePtr value;
176 };
177 Constant constantTable[] = {
178 { "false", mal::falseValue() },
179 { "nil", mal::nilValue() },
180 { "true", mal::trueValue() },
181 };
182
183 String token = tokeniser.next();
184 if (token[0] == '"') {
185 return mal::string(unescape(token));
186 }
187 if (token[0] == ':') {
188 return mal::keyword(token);
189 }
190 if (token == "^") {
191 malValuePtr meta = readForm(tokeniser);
192 malValuePtr value = readForm(tokeniser);
193 // Note that meta and value switch places
194 return mal::list(mal::symbol("with-meta"), value, meta);
195 }
196 for (auto &constant : constantTable) {
197 if (token == constant.token) {
198 return constant.value;
199 }
200 }
201 for (auto &macro : macroTable) {
202 if (token == macro.token) {
203 return processMacro(tokeniser, macro.symbol);
204 }
205 }
206 if (std::regex_match(token, intRegex)) {
207 return mal::integer(token);
208 }
209 return mal::symbol(token);
210 }
211
212 static void readList(Tokeniser& tokeniser, malValueVec* items,
213 const String& end)
214 {
215 while (1) {
216 MAL_CHECK(!tokeniser.eof(), "Expected \"%s\", got EOF", end.c_str());
217 if (tokeniser.peek() == end) {
218 tokeniser.next();
219 return;
220 }
221 items->push_back(readForm(tokeniser));
222 }
223 }
224
225 static malValuePtr processMacro(Tokeniser& tokeniser, const String& symbol)
226 {
227 return mal::list(mal::symbol(symbol), readForm(tokeniser));
228 }