case /^:/:
return ":" token
case /^"/:
- return reader_read_string(token)
+		if (token ~ /^"(\\[^\r\n]|[^\\"\r\n])*"$/) {
+ return reader_read_string(token)
+ } else {
+ return "!\"Expected '\"', got EOF."
+ }
case /^-?[0-9]+$/:
return "+" token
default:
function reader_tokenizer(str, reader, len, r)
{
- for (len = 0; match(str, /^[ \t\r\n,]*(~@|[\[\]{}()'`~^@]|\"(\\[^\r\n]|[^\\"\r\n])*\"|;[^\r\n]*|[^ \t\r\n\[\]{}('"`,;)^~@][^ \t\r\n\[\]{}('"`,;)]*)/, r); ) {
+ for (len = 0; match(str, /^[ \t\r\n,]*(~@|[\[\]{}()'`~^@]|\"(\\[^\r\n]|[^\\"\r\n])*\"?|;[^\r\n]*|[^ \t\r\n\[\]{}('"`,;)^~@][^ \t\r\n\[\]{}('"`,;)]*)/, r); ) {
if (substr(r[1], 1, 1) != ";") {
reader[len++] = r[1]
}
READ_STRING:
REM PRINT "READ_STRING"
C=ASC(MID$(T$,LEN(T$),1))
- IF C<>34 THEN R=-1:ER=-1:E$="expected '"+CHR$(34)+"'":GOTO READ_FORM_RETURN
+ IF C<>34 THEN R=-1:ER=-1:E$="expected '"+CHR$(34)+"', got EOF":GOTO READ_FORM_RETURN
R$=MID$(T$,2,LEN(T$)-2)
S1$=CHR$(92)+CHR$(92):S2$=CHR$(127):GOSUB REPLACE: REM protect backslashes
S1$=CHR$(92)+CHR$(34):S2$=CHR$(34):GOSUB REPLACE: REM unescape quotes
Reader *reader = reader_new();
- regex = g_regex_new ("[\\s ,]*(~@|[\\[\\]{}()'`~@]|\"(?:[\\\\].|[^\\\\\"])*\"|;.*|[^\\s \\[\\]{}()'\"`~@,;]*)", 0, 0, &err);
+ regex = g_regex_new ("[\\s ,]*(~@|[\\[\\]{}()'`~@]|\"(?:[\\\\].|[^\\\\\"])*\"?|;.*|[^\\s \\[\\]{}()'\"`~@,;]*)", 0, 0, &err);
g_regex_match (regex, line, 0, &matchInfo);
if (err != NULL) {
token = reader_next(reader);
//g_print("read_atom token: %s\n", token);
- regex = g_regex_new ("(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^\"(.*)\"$|:(.*)|(^[^\"]*$)", 0, 0, &err);
+ regex = g_regex_new ("(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^\"(.*)\"?$|:(.*)|(^[^\"]*$)", 0, 0, &err);
g_regex_match (regex, token, 0, &matchInfo);
if (g_match_info_fetch_pos(matchInfo, 1, &pos, NULL) && pos != -1) {
atom = &mal_false;
} else if (g_match_info_fetch_pos(matchInfo, 6, &pos, NULL) && pos != -1) {
//g_print("read_atom string: %s\n", token);
+ int end = strlen(token)-1;
+		if (end < 1 || token[end] != '"') { abort("expected '\"', got EOF"); }
+ token[end] = '\0';
atom = malval_new_string(g_strcompress(g_match_info_fetch(matchInfo, 6)));
} else if (g_match_info_fetch_pos(matchInfo, 7, &pos, NULL) && pos != -1) {
//g_print("read_atom keyword\n");
(def tok-re #"[\s,]*(~@|[\[\]{}()'`~^@]|\"(?:[\\].|[^\\\"])*\"?|;.*|[^\s\[\]{}()'\"`@,;]+)")
(def int-re #"^-?[0-9]+$")
+(def badstr-re #"^\"(|.*[^\"])$")
(def str-re #"^\"(.*)\"$")
(defn tokenize [s]
(defn read-atom [rdr]
(let [token (rdr-next rdr)]
(cond
- (re-seq int-re token) #?(:cljs (js/parseInt token)
- :clj (Integer/parseInt token))
- (re-seq str-re token) (unescape (second (re-find str-re token)))
- (= \: (get token 0)) (keyword (subs token 1))
- (= "nil" token) nil
- (= "true" token) true
- (= "false" token) false
- :else (symbol token))))
+ (re-seq int-re token) #?(:cljs (js/parseInt token)
+ :clj (Integer/parseInt token))
+ (re-seq badstr-re token) (throw-str (str "expected '\"', got EOF"))
+ (re-seq str-re token) (unescape (second (re-find str-re token)))
+ (= \: (get token 0)) (keyword (subs token 1))
+ (= "nil" token) nil
+ (= "true" token) true
+ (= "false" token) false
+ :else (symbol token))))
(declare read-form)
when token == "true" then true
when token == "false" then false
when token == "nil" then nil
- when token[0] == '"' then token[1..-2].gsub(/\\(.)/, {"\\\"" => "\"",
- "\\n" => "\n",
- "\\\\" => "\\"})
+ when token[0] == '"'
+      parse_error "expected '\"', got EOF" if token.size < 2 || token[-1] != '"'
+ token[1..-2].gsub(/\\(.)/, {"\\\"" => "\"",
+ "\\n" => "\n",
+ "\\\\" => "\\"})
when token[0] == ':' then "\u029e#{token[1..-1]}"
else Mal::Symbol.new token
end
end
def tokenize(str)
- regex = /[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*)/
+ regex = /[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)]*)/
str.scan(regex).map { |m| m[1] }.reject(&.empty?)
end
public static List<string> tokenize(string str) {
List<string> tokens = new List<string>();
- string pattern = @"[\s ,]*(~@|[\[\]{}()'`~@]|""(?:[\\].|[^\\""])*""|;.*|[^\s \[\]{}()'""`~@,;]*)";
+ string pattern = @"[\s ,]*(~@|[\[\]{}()'`~@]|""(?:[\\].|[^\\""])*""?|;.*|[^\s \[\]{}()'""`~@,;]*)";
Regex regex = new Regex(pattern);
foreach (Match match in regex.Matches(str)) {
string token = match.Groups[1].Value;
public static MalVal read_atom(Reader rdr) {
string token = rdr.next();
- string pattern = @"(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^("".*"")$|:(.*)|(^[^""]*$)";
+ string pattern = @"(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^("".*)|:(.*)|(^[^""]*$)";
Regex regex = new Regex(pattern);
Match match = regex.Match(token);
//Console.WriteLine("token: ^" + token + "$");
return Mal.types.False;
} else if (match.Groups[6].Value != String.Empty) {
string str = match.Groups[6].Value;
+            if (str.Length < 2 || str[str.Length-1] != '"') {
+ throw new ParseError("expected '\"', got EOF");
+ }
str = str.Substring(1, str.Length-2)
.Replace("\\\\", "\u029e")
.Replace("\\\"", "\"")
}
}
-auto tokenize_ctr = ctRegex!(r"[\s,]*(~@|[\[\]{}()'`~^@]|" `"` `(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"` r"`,;)]*)");
+auto tokenize_ctr = ctRegex!(r"[\s,]*(~@|[\[\]{}()'`~^@]|" `"` `(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"` r"`,;)]*)");
string[] tokenize(string str)
{
case ':':
return new MalString("\u029e" ~ token[1..$]);
case '"':
+ if (token[$-1] != '"')
+ {
+ throw new Exception("expected '\"', got EOF");
+ }
return parse_string(token);
default:
auto captures = matchFirst(token, integer_ctr);
import 'types.dart';
final malRegExp = new RegExp(
- r"""[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*)""");
+ r"""[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)]*)""");
class Reader {
final List<String> tokens;
}
if (token[0] == '"') {
+    if (token.length < 2 || token[token.length - 1] != '"') {
+ throw new ParseException("expected '\"', got EOF");
+ }
var sanitizedToken = token
// remove surrounding quotes
.substring(1, token.length - 1)
make lib.types math.parser regexp sequences splitting strings ;
IN: lib.reader
-CONSTANT: token-regex R/ (~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)~^@]+)/
+CONSTANT: token-regex R/ (~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)~^@]+)/
DEFER: read-form
: (read-string) ( str -- maltype )
- rest but-last R/ \\./ [
- {
- { [ dup >string "\\\\" = ] [ drop "\\" ] }
- { [ dup >string "\\n" = ] [ drop "\n" ] }
- { [ dup >string "\\\"" = ] [ drop "\"" ] }
- [ ]
- } cond
- ] re-replace-with ;
+ dup last CHAR: " = [
+ rest but-last R/ \\./ [
+ {
+ { [ dup >string "\\\\" = ] [ drop "\\" ] }
+ { [ dup >string "\\n" = ] [ drop "\n" ] }
+ { [ dup >string "\\\"" = ] [ drop "\"" ] }
+ [ ]
+ } cond
+ ] re-replace-with
+ ] [
+ "expected '\"', got EOF" throw
+ ] if ;
: (read-atom) ( str -- maltype )
{
{
private static Str[] tokenize(Str s)
{
- r := Regex <|[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*)|>
+ r := Regex <|[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)]*)|>
m := r.matcher(s)
tokens := Str[,]
while (m.find())
{
token := reader.next
intRegex := Regex <|^-?\d+$|>
+ strRegex := Regex <|^".*"|>
+ strBadRegex := Regex <|^".*|>
if (token == "nil") return MalNil.INSTANCE
if (token == "true") return MalTrue.INSTANCE
if (token == "false") return MalFalse.INSTANCE
if (intRegex.matches(token)) return MalInteger(token.toInt)
+ if (strRegex.matches(token)) return MalString.make(unescape_str(token[1..-2]))
+ if (strBadRegex.matches(token)) throw Err("expected '\"', got EOF")
if (token[0] == '"') return MalString.make(unescape_str(token[1..-2]))
if (token[0] == ':') return MalString.makeKeyword(token[1..-1])
return MalSymbol(token)
results := make([]string, 0, 1)
// Work around lack of quoting in backtick
re := regexp.MustCompile(`[\s,]*(~@|[\[\]{}()'` + "`" +
- `~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"` + "`" +
+ `~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"` + "`" +
`,;)]*)`)
for _, group := range re.FindAllStringSubmatch(str, -1) {
if (group[1] == "") || (group[1][0] == ';') {
}
return i, nil
} else if (*token)[0] == '"' {
+		if len(*token) < 2 || (*token)[len(*token)-1] != '"' {
+ return nil, errors.New("expected '\"', got EOF")
+ }
str := (*token)[1 : len(*token)-1]
return strings.Replace(
strings.Replace(
}
def static tokenizer(String str) {
- def m = str =~ /[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*)/
+ def m = str =~ /[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)]*)/
def tokens = []
while (m.find()) {
String token = m.group(1)
def static read_atom(Reader rdr) {
def token = rdr.next()
- def m = token =~ /(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^"(.*)"$|:(.*)|(^[^"]*$)/
+ def m = token =~ /(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^"(.*)"$|^"(.*)$|:(.*)|(^[^"]*$)/
if (!m.find()) {
throw new MalException("unrecognized token '$token'")
}
} else if (m.group(5) != null) {
false
} else if (m.group(6) != null) {
+ if (token[token.length() - 1] != '"') {
+ throw new MalException("expected '\"', got EOF")
+ }
StringEscapeUtils.unescapeJava(m.group(6))
} else if (m.group(7) != null) {
- "\u029e" + m.group(7)
+ throw new MalException("expected '\"', got EOF")
} else if (m.group(8) != null) {
- new MalSymbol(m.group(8))
+ "\u029e" + m.group(8)
+ } else if (m.group(9) != null) {
+ new MalSymbol(m.group(9))
} else {
throw new MalException("unrecognized '${m.group(0)}'")
}
"\n"),
"\""),
"\\"));
- case _ if (re_str.match(token)):
+ case _ if (re_str_bad.match(token)):
throw 'expected \'"\', got EOF';
case _:
MalSymbol(token);
(def tok-re (.compile re "[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:[\\\\].|[^\\\\\"])*\"?|;.*|[^\\s\\[\\]{}()'\"`@,;]+)"))
(def int-re (.compile re "-?[0-9]+$"))
+(def str-re (.compile re "^\".*\"$"))
+(def str-bad-re (.compile re "^\".*$"))
(defn tokenize [str]
(list-comp
(setv token (.next rdr))
(if
(.match re int-re token) (int token)
- (= "\"" (get token 0)) (Str (unescape (cut token 1 -1)))
+ (.match re str-re token) (Str (unescape (cut token 1 -1)))
+ (.match re str-bad-re token) (raise (Exception (+ "expected '\"', got EOF")))
(= ":" (get token 0)) (Keyword token)
(= "nil" token) None
(= "true" token) True
)
)
- tokenizerRegex := Regex with("[\\s ,]*(~@|[\\[\\]{}()'`~@]|\"(?:[\\\\].|[^\\\\\"])*\"|;.*|[^\\s \\[\\]{}()'\"`~@,;]*)")
+ tokenizerRegex := Regex with("[\\s ,]*(~@|[\\[\\]{}()'`~@]|\"(?:[\\\\].|[^\\\\\"])*\"?|;.*|[^\\s \\[\\]{}()'\"`~@,;]*)")
tokenize := method(str,
tokenizerRegex matchesIn(str) \
numberRegex := Regex with("^-?[0-9]+$")
read_string := method(token,
+ (token endsWithSeq("\"")) ifFalse(Exception raise("expected '\"', got EOF"))
placeholder := 127 asCharacter
token exSlice(1, -1) replaceSeq("\\\\", placeholder) replaceSeq("\\\"", "\"") replaceSeq("\\n", "\n") replaceSeq(placeholder, "\\")
)
public static ArrayList<String> tokenize(String str) {
ArrayList<String> tokens = new ArrayList<String>();
- Pattern pattern = Pattern.compile("[\\s ,]*(~@|[\\[\\]{}()'`~@]|\"(?:[\\\\].|[^\\\\\"])*\"|;.*|[^\\s \\[\\]{}()'\"`~@,;]*)");
+ Pattern pattern = Pattern.compile("[\\s ,]*(~@|[\\[\\]{}()'`~@]|\"(?:[\\\\].|[^\\\\\"])*\"?|;.*|[^\\s \\[\\]{}()'\"`~@,;]*)");
Matcher matcher = pattern.matcher(str);
while (matcher.find()) {
String token = matcher.group(1);
public static MalVal read_atom(Reader rdr)
throws ParseError {
String token = rdr.next();
- Pattern pattern = Pattern.compile("(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^\"(.*)\"$|:(.*)|(^[^\"]*$)");
+ Pattern pattern = Pattern.compile("(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^\"(.*)\"$|^\"(.*)$|:(.*)|(^[^\"]*$)");
Matcher matcher = pattern.matcher(token);
if (!matcher.find()) {
throw new ParseError("unrecognized token '" + token + "'");
} else if (matcher.group(6) != null) {
return new MalString(StringEscapeUtils.unescapeJson(matcher.group(6)));
} else if (matcher.group(7) != null) {
- return new MalString("\u029e" + matcher.group(7));
+ throw new ParseError("expected '\"', got EOF");
} else if (matcher.group(8) != null) {
- return new MalSymbol(matcher.group(8));
+ return new MalString("\u029e" + matcher.group(8));
+ } else if (matcher.group(9) != null) {
+ return new MalSymbol(matcher.group(9));
} else {
throw new ParseError("unrecognized '" + matcher.group(0) + "'");
}
function tokenize(str)
- re = r"[\s,]*(~@|[\[\]{}()'`~^@]|\"(?:\\.|[^\\\"])*\"|;.*|[^\s\[\]{}('\"`,;)]*)"
+ re = r"[\s,]*(~@|[\[\]{}()'`~^@]|\"(?:\\.|[^\\\"])*\"?|;.*|[^\s\[\]{}('\"`,;)]*)"
tokens = map((m) -> m.captures[1], eachmatch(re, str))
filter((t) -> t != "" && t[1] != ';', tokens)
end
replace(token[2:end-1], r"\\.", (r) -> get(Dict("\\n"=>"\n",
"\\\""=>"\"",
"\\\\"=>"\\"), r, r))
+ elseif ismatch(r"^\".*$", token)
+ error("expected '\"', got EOF")
elseif token[1] == ':'
"\u029e$(token[2:end])"
elseif token == "nil"
import kotlin.text.Regex
-val TOKEN_REGEX = Regex("[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"|;.*|[^\\s\\[\\]{}('\"`,;)]*)")
-val ATOM_REGEX = Regex("(^-?[0-9]+$)|(^nil$)|(^true$)|(^false$)|^\"(.*)\"$|:(.*)|(^[^\"]*$)")
+val TOKEN_REGEX = Regex("[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|;.*|[^\\s\\[\\]{}('\"`,;)]*)")
+val ATOM_REGEX = Regex("(^-?[0-9]+$)|(^nil$)|(^true$)|(^false$)|^\"(.*)\"$|^\"(.*)$|:(.*)|(^[^\"]*$)")
class Reader(sequence: Sequence<String>) {
val tokens = sequence.iterator()
else m.groups[1]?.value.toString()
})
} else if (groups[6]?.value != null) {
- MalKeyword(groups[6]?.value as String)
+ throw MalReaderException("expected '\"', got EOF")
} else if (groups[7]?.value != null) {
- MalSymbol(groups[7]?.value as String)
+ MalKeyword(groups[7]?.value as String)
+ } else if (groups[8]?.value != null) {
+ MalSymbol(groups[8]?.value as String)
} else {
throw MalReaderException("Unrecognized token: " + next)
}
[\s,]* # whitespace or commas
( ~@ # special two-char ~@
| [\[\]{}()'`~^@] # special single char one of []{}'`~^@
- | "(?:\\.| [^\\"])*" # double-quoted string
+ | "(?:\\.| [^\\"])*"? # double-quoted string
| ;.* # any seq of chars starting ;
| [^\s\[\]{}('"`,;)]+ # seq of non-special chars: symbols, numbers,
) # "true", "false" and "nil".
if token in constants
{type: \const, value: reader.next!}
else if token[0] == '"'
+ if not token.endsWith '"'
+ parse-error "expected '\"', got EOF"
{type: \string, value: decode-string reader.next!}
else if token.match /^-?\d+$/
{type: \int, value: parseInt reader.next!}
make "w word :w :c
make "rest butfirst :rest
]
-(throw "error [Expected closing quotes])
+(throw "error [Expected closing quotes, not EOF])
end
to read_next_token :s
$(call __string,$(strip $(call READ_STRING,$(1))))\
$(eval $(if $(filter $(DQUOTE),$(word 1,$($(1)))),\
$(eval $(1) := $(wordlist 2,$(words $($(1))),$($(1)))),\
- $(call _error,Expected '$(DQUOTE)' in; $($(1))))),\
+ $(call _error,Expected '$(DQUOTE)' in; $($(1))$(COMMA) got EOF))),\
$(if $(filter $(COLON),$(ch)),\
$(eval $(1) := $(wordlist 2,$(words $($(1))),$($(1))))\
$(call _keyword,$(call READ_KEYWORD,$(1))),\
classdef reader
methods (Static = true)
function tokens = tokenize(str)
- re = '[\s,]*(~@|[\[\]{}()''`~^@]|"(?:\\.|[^\\"])*"|;[^\n]*|[^\s\[\]{}(''"`,;)]*)';
+ re = '[\s,]*(~@|[\[\]{}()''`~^@]|"(?:\\.|[^\\"])*"?|;[^\n]*|[^\s\[\]{}(''"`,;)]*)';
% extract the capture group (to ignore spaces and commas)
tokens = cellfun(@(x) x(1), regexp(str, re, 'tokens'));
comments = cellfun(@(x) length(x) > 0 && x(1) == ';', tokens);
if not(isempty(regexp(token, '^-?[0-9]+$', 'match')))
atm = str2double(token);
elseif strcmp(token(1), '"')
+ if not(token(end) == '"')
+ error('expected ''"'', got EOF');
+ end
atm = token(2:length(token)-1);
atm = strrep(atm, '\\', char(255));
atm = strrep(atm, '\"', '"');
["re-matches", "re", "strn", ["concat", "acc", "g1"]]]]]],
["def", "tokenize", ["fn", ["strn"],
- ["let", ["re-str", ["`", "[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"|;.*|[^\\s\\[\\]{}('\"`,;)]*)"],
+ ["let", ["re-str", ["`", "[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|;.*|[^\\s\\[\\]{}('\"`,;)]*)"],
"re", ["RegExp", "re-str", ["`", "g"]]],
[".",
["re-matches", "re", "strn", ["`", []]],
["if", [".", "token", ["`", "match"], ["RegExp", ["`", "^-?[0-9]+$"]]],
["parseInt", "token", 10],
["if", ["=", ["`", "\""], ["get", "token", 0]],
- [".",
- ["slice", "token", 1, ["-", ["count", "token"], 1]],
- ["`", "replace"], ["RegExp", ["`", "\\\\(.)"], ["`", "g"]],
- ["fn", ["_", "c"],
- ["if", ["=", "c", ["`", "n"]],
- ["`", "\n"],
- "c"]]],
+ ["if", ["=", ["`", "\""], ["get", "token", ["-", ["count", "token"], 1]]],
+ [".",
+ ["slice", "token", 1, ["-", ["count", "token"], 1]],
+ ["`", "replace"], ["RegExp", ["`", "\\\\(.)"], ["`", "g"]],
+ ["fn", ["_", "c"],
+ ["if", ["=", "c", ["`", "n"]],
+ ["`", "\n"],
+ "c"]]],
+ ["throw", ["`", "expected '\"', got EOF"]]],
["if", ["=", ["`", ":"], ["get", "token", 0]],
["keyword", ["slice", "token", 1]],
["if", ["=", ["`", "nil"], "token"],
import re, strutils, sequtils, types
let
- tokenRE = re"""[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*)"""
+ tokenRE = re"""[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)]*)"""
intRE = re"-?[0-9]+$"
type
proc read_atom(r: var Reader): MalType =
let t = r.next
if t.match(intRE): number t.parseInt
- elif t[0] == '"': str t[1 .. <t.high].multiReplace(("\\\"", "\""), ("\\n", "\n"), ("\\\\", "\\"))
+ elif t[0] == '"':
+ if t[^1] != '"': raise newException(ValueError, "expected '\"', got EOF")
+ str t[1 .. <t.high].multiReplace(("\\\"", "\""), ("\\n", "\n"), ("\\\\", "\\"))
elif t[0] == ':': keyword t[1 .. t.high]
elif t == "nil": nilObj
elif t == "true": trueObj
NSObject * read_atom(Reader * rdr) {
NSRegularExpression *regex = [NSRegularExpression
- regularExpressionWithPattern:@"(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^\"(.*)\"$|:(.*)|(^[^\"]*$)"
+ regularExpressionWithPattern:@"(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^\"(.*)\"$|^\"(.*)$|:(.*)|(^[^\"]*$)"
options:0
error:NULL];
NSNumberFormatter *numf = [[NSNumberFormatter alloc] init];
stringByReplacingOccurrencesOfString:@"\\\"" withString:@"\""]
stringByReplacingOccurrencesOfString:@"\\n" withString:@"\n"]
stringByReplacingOccurrencesOfString:@"\u029e" withString:@"\\"];
- } else if ([match rangeAtIndex:7].location < -1ULL/2) { // keyword
+ } else if ([match rangeAtIndex:7].location < -1ULL/2) { // string
+ @throw @"read_atom: expected '\"', got EOF";
+ } else if ([match rangeAtIndex:8].location < -1ULL/2) { // keyword
return [NSString stringWithFormat:@"\u029e%@",
- [token substringWithRange:[match rangeAtIndex:7]]];
- } else if ([match rangeAtIndex:8].location < -1ULL/2) { // symbol
+ [token substringWithRange:[match rangeAtIndex:8]]];
+ } else if ([match rangeAtIndex:9].location < -1ULL/2) { // symbol
return [MalSymbol stringWithString:token];
}
}
Str : string;
begin
RE := TRegExpr.Create;
- RE.Expression := '(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^(\".*\")$|:(.*)|(^[^\"]*$)';
+ RE.Expression := '(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^(\".*\")$|^(\".*)$|:(.*)|(^[^\"]*$)';
Token := Reader.Next();
//WriteLn('token: ' + Token);
if RE.Exec(Token) then
read_atom := TMalString.Create(Str)
end
else if RE.Match[7] <> '' then
- read_atom := TMalString.Create(#127 + RE.Match[7])
+ raise Exception.Create('expected ''"'', got EOF')
else if RE.Match[8] <> '' then
+ read_atom := TMalString.Create(#127 + RE.Match[8])
+ else if RE.Match[9] <> '' then
read_atom := TMalSymbol.Create(Token);
end
else
"" (List.map (function | Str.Delim x -> f x | Str.Text x -> x)
(Str.full_split re str))
-let token_re = (Str.regexp "~@\\|[][{}()'`~^@]\\|\"\\(\\\\.\\|[^\"]\\)*\"\\|;.*\\|[^][ \n{}('\"`,;)]*")
+let token_re = (Str.regexp "~@\\|[][{}()'`~^@]\\|\"\\(\\\\.\\|[^\"]\\)*\"?\\|;.*\\|[^][ \n{}('\"`,;)]*")
type reader = {
form : Types.mal_type;
| _ -> (match token.[1] with
| '0'..'9' -> T.Int (int_of_string token)
| _ -> Types.symbol token))
- | '"' -> T.String (gsub (Str.regexp "\\\\.")
- (function
- | "\\n" -> "\n"
- | x -> String.sub x 1 1)
- (String.sub token 1 ((String.length token) - 2)))
+ | '"' -> (match token.[String.length token - 1] with
+ | '"' -> T.String (gsub (Str.regexp "\\\\.")
+ (function
+ | "\\n" -> "\n"
+ | x -> String.sub x 1 1)
+ (String.sub token 1 ((String.length token) - 2)))
+ | _ -> output_string stderr ("expected '\"', got EOF\n");
+ flush stderr;
+ raise End_of_file)
| ':' -> T.Keyword (Str.replace_first (Str.regexp "^:") "" token)
| _ -> Types.symbol token
}
function tokenize($str) {
- $pat = "/[\s,]*(php\/|~@|[\[\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"|;.*|[^\s\[\]{}('\"`,;)]*)/";
+ $pat = "/[\s,]*(php\/|~@|[\[\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|;.*|[^\s\[\]{}('\"`,;)]*)/";
preg_match_all($pat, $str, $matches);
return array_values(array_filter($matches[1], '_real_token'));
}
if (preg_match("/^-?[0-9]+$/", $token)) {
return intval($token, 10);
} elseif ($token[0] === "\"") {
+        if (strlen($token) < 2 || substr($token, -1) !== "\"") {
+ throw new Exception("expected '\"', got EOF");
+ }
$str = substr($token, 1, -1);
$str = str_replace('\\\\', chr(0x7f), $str);
$str = str_replace('\\"', '"', $str);
CREATE FUNCTION reader.tokenize(str varchar) RETURNS varchar[] AS $$
DECLARE
- re varchar = E'[[:space:] ,]*(~@|[\\[\\]{}()\'`~@]|"(?:[\\\\].|[^\\\\"])*"|;[^\n]*|[^\\s \\[\\]{}()\'"`~@,;]*)';
+ re varchar = E'[[:space:] ,]*(~@|[\\[\\]{}()\'`~@]|"(?:[\\\\].|[^\\\\"])*"?|;[^\n]*|[^\\s \\[\\]{}()\'"`~@,;]*)';
BEGIN
RETURN ARRAY(SELECT tok FROM
(SELECT (regexp_matches(str, re, 'g'))[1] AS tok) AS x
str := replace(str, '\n', E'\n');
str := replace(str, chr(CAST(x'7f' AS integer)), E'\\');
result := types._stringv(str);
+ ELSIF token ~ '^".*' THEN -- unclosed string
+ RAISE EXCEPTION 'expected ''"'', got EOF';
ELSIF token ~ '^:.*' THEN -- keyword
-- keyword
result := types._keywordv(substring(token FROM 2 FOR (char_length(token)-1)));
-- tokenize:
-- takes a string and returns a nested table of token strings
FUNCTION tokenize(str varchar) RETURN tokens IS
- re varchar2(100) := '[[:space:] ,]*(~@|[][{}()''`~@]|"(([\].|[^\"])*)"|;[^' || chr(10) || ']*|[^][[:space:] {}()''"`~@,;]*)';
+ re varchar2(100) := '[[:space:] ,]*(~@|[][{}()''`~@]|"(([\].|[^\"])*)"?|;[^' || chr(10) || ']*|[^][[:space:] {}()''"`~@,;]*)';
tok CLOB;
toks tokens := tokens();
cnt integer;
str := REPLACE(str, '\n', chr(10));
str := REPLACE(str, '\\', chr(92));
result := types.string(M, str);
+ ELSIF REGEXP_LIKE(token, '^".*') THEN -- unclosed string
+ raise_application_error(-20003,
+ 'expected ''"'', got EOF', TRUE);
ELSIF REGEXP_LIKE(token, '^:.*') THEN -- keyword
-- keyword
result := types.keyword(M, SUBSTR(token, 2, LENGTH(token)-1));
token := rdr.peek();
IF token IS NULL THEN
raise_application_error(-20003,
- 'expected ''' || last || '''', TRUE);
+ 'expected ''' || last || ''', got EOF', TRUE);
END IF;
IF token = last THEN EXIT; END IF;
items.EXTEND();
function tokenize {
- $r = [regex]"[\s,]*(~@|[\[\]{}()'``~^@]|`"(?:\\.|[^\\`"])*`"|;.*|[^\s\[\]{}('`"``,;)]*)"
+ $r = [regex]"[\s,]*(~@|[\[\]{}()'``~^@]|`"(?:\\.|[^\\`"])*`"?|;.*|[^\s\[\]{}('`"``,;)]*)"
$r.Matches($args) |
Where-Object { $_.Groups.Item(1).Value.Length -gt 0 -and
$_.Groups.Item(1).Value[0] -ne ";" } |
$s = $s -replace "\\n", "`n"
$s = $s -replace "$([char]0x29e)", "\"
return $s
+ } elseif ($token -match "^`".*") {
+ throw "expected '`"', got EOF"
} elseif ($token -match ":.*") {
return "$([char]0x29e)$($token.substring(1))"
} elseif ($token -eq "true") {
of all the tokens (strings) in it. The following regular expression
(PCRE) will match all mal tokens.
```
-[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*)
+[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)]*)
```
* For each match captured within the parenthesis starting at char 6 of the
regular expression a new token will be created.
* ```[\[\]{}()'`~^@]```: Captures any special single character, one of
```[]{}()'`~^@``` (tokenized).
- * `"(?:\\.|[^\\"])*"`: Starts capturing at a double-quote and stops at the
+ * `"(?:\\.|[^\\"])*"?`: Starts capturing at a double-quote and stops at the
next double-quote unless it was proceeded by a backslash in which case it
- includes it until the next double-quote (tokenized).
+ includes it until the next double-quote (tokenized). It will also
+ match unbalanced strings (no ending double-quote) which should be
+ reported as an error.
* `;.*`: Captures any sequence of characters starting with `;` (tokenized).
}
tokenize <- function(str) {
- re <- "[\\s,]*(~@|[\\[\\]\\{\\}\\(\\)'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"|;.*|[^\\s\\[\\]\\{\\}\\('\"`,;\\)]*)"
+ re <- "[\\s,]*(~@|[\\[\\]\\{\\}\\(\\)'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|;.*|[^\\s\\[\\]\\{\\}\\('\"`,;\\)]*)"
m <- lapply(regmatches(str, gregexpr(re, str, perl=TRUE)),
function(e) sub("^[\\s,]+", "", e, perl=TRUE))
res <- list()
} else if (re_match("^-?[0-9][0-9.]*$", token)) {
as.double(token)
} else if (substr(token,1,1) == "\"") {
+ if (substr(token, nchar(token), nchar(token)) != "\"") {
+ throw("expected '\"', got EOF")
+ }
gsub("\x7f", "\\\\",
gsub("\\\\n", "\n",
gsub("\\\\\"", "\"",
(define (tokenize str)
(filter-not (lambda (s) (or (equal? s "") (equal? (substring s 0 1) ";")))
- (regexp-match* #px"[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"|;[^\n]*|[^\\s\\[\\]{}('\"`,;)]*)"
+ (regexp-match* #px"[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|;[^\n]*|[^\\s\\[\\]{}('\"`,;)]*)"
str #:match-select cadr)))
(define (read_atom rdr)
(string->number token)]
[(regexp-match #px"^\".*\"$" token)
(with-input-from-string token read)]
+ [(regexp-match #px"^\".*$" token)
+ (raise "expected '\"', got EOF")]
[(regexp-match #px"^:" token) (_keyword (substring token 1))]
[(equal? "nil" token) nil]
[(equal? "true" token) #t]
when token == "true" then return new_true()
when token == "false" then return new_false()
when substr(token, 1, 1) == ':' then return new_keyword(parse_keyword(token))
- when substr(token, 1, 1) == '"' then return new_string(parse_string(token))
+ when substr(token, 1, 1) == '"' then do
+ if substr(token, length(token), 1) \== '"' then do
+      err = "expected '" || '"' || "', got EOF"
+ return "ERR"
+ end
+ return new_string(parse_string(token))
+ end
otherwise
return new_symbol(token)
end
return None
def tokenize(str):
- re_str = "[\s,]*(~@|[\[\]{}()'`~^@]|\"(?:[\\\\].|[^\\\\\"])*\"|;.*|[^\s\[\]{}()'\"`@,;]+)"
+ re_str = "[\s,]*(~@|[\[\]{}()'`~^@]|\"(?:[\\\\].|[^\\\\\"])*\"?|;.*|[^\s\[\]{}()'\"`@,;]+)"
if IS_RPYTHON:
tok_re = re_str
else:
## elif re.match(float_re, token): return int(token)
elif token[0] == '"':
end = len(token)-1
- if end < 2:
+ if end == 1:
return MalStr(u"")
+ elif end < 1 or token[end] != '"':
+ types.throw_str("expected '\"', got EOF")
else:
s = unicode(token[1:end])
s = types._replace(u'\\\\', u"\u029e", s)
def tokenize(str)
- re = /[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*)/
+ re = /[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)]*)/
return str.scan(re).map{|m| m[0]}.select{ |t|
t != "" && t[0..0] != ";"
}
when /^-?[0-9]+$/ then token.to_i # integer
when /^-?[0-9][0-9.]*$/ then token.to_f # float
when /^".*"$/ then parse_str(token) # string
+ when /^".*$/ then raise "expected '\"', got EOF"
when /^:/ then "\u029e" + token[1..-1] # keyword
when "nil" then nil
when "true" then true
}
def tokenize(str: String): Array[String] = {
- val re = """[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*)""".r
+ val re = """[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)]*)""".r
re.findAllMatchIn(str).map{ _.group(1) }
.filter{ s => s != "" && s(0) != ';' }
.toArray
val re_int = """^(-?[0-9]+)$""".r
val re_flt = """^(-?[0-9][0-9.]*)$""".r
val re_str = """^"(.*)"$""".r
+ val re_str_bad = """^"(.*)$""".r
val re_key = """^:(.*)$""".r
return token match {
case re_int(i) => i.toLong // integer
case re_flt(f) => f.toDouble // float
case re_str(s) => parse_str(s) // string
+ case re_str_bad(s) =>
+ throw new Exception("expected '\"', got EOF")
case re_key(k) => "\u029e" + k // keyword
case "nil" => null
case "true" => true
}
def tokenize(str string) List<string> {
- var re = RegExp.new("[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"|;.*|[^\\s\\[\\]{}('\"`,;)]*)", "g")
+ var re = RegExp.new("[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|;.*|[^\\s\\[\\]{}('\"`,;)]*)", "g")
var tokens List<string> = []
var match string
while (match = re.exec(str)[1]) != "" {
"|" +
"[\\[\\]{}()`'~^@]" + // Punctuation: Any one of []{}()`'~^@
"|" +
- "\"(?:\\\\.|[^\\\\\"])*\"" + // Quoted string: characters other than \ or ", or any escaped characters
+ "\"(?:\\\\.|[^\\\\\"])*\"?" + // Quoted string: characters other than \ or ", or any escaped characters
"|" +
";.*" + // Comment: semicolon followed by anything
"|" +
"|" +
"(^\".*\"$)" + // String
"|" +
+ "(^\".*$)" + // Invalid/unclosed string
+ "|" +
"(:.*)" + // Keyword
"|" +
"(^[^\"]*$)" // Symbol
return make_false()
} else if have_match(match, at_index: 7) { // String
return make_string(unescape(token))
- } else if have_match(match, at_index: 8) { // Keyword
+ } else if have_match(match, at_index: 8) { // Invalid/unclosed string
+ try throw_error("expected '\"', got EOF")
+ } else if have_match(match, at_index: 9) { // Keyword
return make_keyword(token[token.startIndex.successor() ..< token.endIndex])
- } else if have_match(match, at_index: 9) { // Symbol
+ } else if have_match(match, at_index: 10) { // Symbol
return make_symbol(token)
}
}
if rdr.str[cidx] == "\"" { break }
cidx = rdr.pos
}
- if rdr.pos > rdr.str.endIndex {
+ if rdr.str[rdr.str.index(before: rdr.pos)] != "\"" {
throw MalError.Reader(msg: "Expected '\"', got EOF")
}
let matchStr = rdr.str.substring(with:
}
proc tokenize str {
- set re {[\s,]*(~@|[\[\]\{\}()'`~^@]|\"(?:\\.|[^\\\"])*\"|;.*|[^\s\[\]\{\}('\"`~^@,;)]*)}
+ set re {[\s,]*(~@|[\[\]\{\}()'`~^@]|\"(?:\\.|[^\\\"])*\"?|;.*|[^\s\[\]\{\}('\"`~^@,;)]*)}
set tokens {}
foreach {_ capture} [regexp -line -all -inline $re $str] {
if {[string length $capture] > 0 && [string range $capture 0 0] != ";"} {
^false$ { return $::mal_false }
^: { return [keyword_new [parse_keyword $token]] }
^\".*\"$ { return [string_new [parse_string $token]] }
+ ^\".*$ { error "expected '\"', got EOF" }
default { return [symbol_new $token] }
}
}
;;; These should throw some error with no return value
"abc
-;/.+
+;/.*(EOF|end of input|unbalanced).*
(1 "abc
-;/.+
+;/.*(EOF|end of input|unbalanced).*
(1 "abc"
-;/.+
+;/.*(EOF|end of input|unbalanced).*
;; Testing read of quoting
'1
}
function tokenizer(input: string): string[] {
- const regexp = /[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"|;.*|[^\s\[\]{}('"`,;)]*)/g;
+ const regexp = /[\s,]*(~@|[\[\]{}()'`~^@]|"(?:\\.|[^\\"])*"?|;.*|[^\s\[\]{}('"`,;)]*)/g;
const tokens: string[] = [];
while (true) {
const matches = regexp.exec(input);
return new MalNumber(v);
}
if (token[0] === '"') {
+        if (token.length < 2 || token.slice(-1) !== '"') {
+ throw new Error("expected '\"', got EOF");
+ }
const v = token.slice(1, token.length - 1)
.replace(/\\(.)/g, (_, c: string) => c == 'n' ? '\n' : c)
return new MalString(v);
Shared Function tokenize(str As String) As List(Of String)
Dim tokens As New List(Of String)
- Dim pattern As String = "[\s ,]*(~@|[\[\]{}()'`~@]|""(?:[\\].|[^\\""])*""|;.*|[^\s \[\]{}()'""`~@,;]*)"
+ Dim pattern As String = "[\s ,]*(~@|[\[\]{}()'`~@]|""(?:[\\].|[^\\""])*""?|;.*|[^\s \[\]{}()'""`~@,;]*)"
Dim regex As New Regex(pattern)
For Each match As Match In regex.Matches(str)
Dim token As String = match.Groups(1).Value
Shared Function read_atom(rdr As Reader) As MalVal
Dim token As String = rdr.get_next()
- Dim pattern As String = "(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^("".*"")$|^:(.*)|(^[^""]*$)"
+ Dim pattern As String = "(^-?[0-9]+$)|(^-?[0-9][0-9.]*$)|(^nil$)|(^true$)|(^false$)|^("".*)|^:(.*)|(^[^""]*$)"
Dim regex As Regex = New Regex(pattern)
Dim match As Match = regex.Match(token)
'Console.WriteLine("token: ^" + token + "$")
return Mal.types.MalFalse
Else If match.Groups(6).Value <> String.Empty Then
Dim str As String = match.Groups(6).Value
+                If str.Length < 2 OrElse str(str.Length-1) <> """" Then
+ throw New ParseError("expected '""', got EOF")
+ End If
return New Mal.types.MalString(
str.Substring(1, str.Length-2) _
.Replace("\\", ChrW(&H029e)) _
deallocate(s);
end procedure unescape_string_token;
- procedure read_atom(r: inout reader_class; result: out mal_val_ptr) is
+ procedure read_atom(r: inout reader_class; result: out mal_val_ptr; err: out mal_val_ptr) is
variable token, s: line;
variable num: integer;
variable ch: character;
s(1 to s'length) := token(2 to token'length);
new_keyword(s, result);
when '"' =>
+ if token(token'length) /= '"' then
+ new_string("expected '""', got EOF", err);
+ result := null;
+ return;
+ end if;
unescape_string_token(token, s);
new_string(s, result);
when others =>
when ']' => new_string("unexcepted ']'", err);
when '{' => read_sequence(mal_hashmap, "}", r, result, err);
when '}' => new_string("unexcepted '}'", err);
- when others => read_atom(r, result);
+ when others => read_atom(r, result, err);
end case;
end procedure read_form;
(local.set $slen ($strlen (i32.add $tok 1)))
(if (i32.ne (i32.load8_u (i32.add $tok $slen)) (CHR "\""))
(then
- ($THROW_STR_0 "expected '\"'")
+ ($THROW_STR_0 "expected '\"', got EOF")
(return 0))
(else
;; unescape backslashes, quotes, and newlines
#include "yeti_regex.i"
require, "types.i"
-TOKENIZER_REGEXP = regcomp("[[:space:],]*(~@|[][{}()'`~@]|\"([\\].|[^\\\"])*\"|;.*|[^][[:space:]{}()'\"`~@,;]*)", newline=1)
+TOKENIZER_REGEXP = regcomp("[[:space:],]*(~@|[][{}()'`~@]|\"([\\].|[^\\\"])*\"?|;.*|[^][[:space:]{}()'\"`~@,;]*)", newline=1)
func tokenize(str)
{
}
NUMBER_REGEXP = regcomp("^-?[0-9]+$")
+STR_REGEXP = regcomp("^\".*\"$")
+STR_BAD_REGEXP = regcomp("^\".*$")
func unescape(s)
{
else if (token == "true") return MAL_TRUE
else if (token == "false") return MAL_FALSE
else if (regmatch(NUMBER_REGEXP, token)) return MalNumber(val=tonum(token))
- else if (strpart(token, 1:1) == "\"") return MalString(val=unescape(token))
+ else if (regmatch(STR_REGEXP, token)) return MalString(val=unescape(token))
+ else if (regmatch(STR_BAD_REGEXP, token)) return MalError(message=("expected '\"', got EOF"))
else if (strpart(token, 1:1) == ":") return MalKeyword(val=strpart(token, 2:))
else return MalSymbol(val=token)
}