permit multiline comments and strings in macros
[bpt/coccinelle.git] / parsing_cocci / lexer_cocci.mll
CommitLineData
f537ebc4 1(*
17ba0788
C
2 * Copyright 2012, INRIA
3 * Julia Lawall, Gilles Muller
4 * Copyright 2010-2011, INRIA, University of Copenhagen
f537ebc4
C
5 * Julia Lawall, Rene Rydhof Hansen, Gilles Muller, Nicolas Palix
6 * Copyright 2005-2009, Ecole des Mines de Nantes, University of Copenhagen
7 * Yoann Padioleau, Julia Lawall, Rene Rydhof Hansen, Henrik Stuart, Gilles Muller, Nicolas Palix
8 * This file is part of Coccinelle.
9 *
10 * Coccinelle is free software: you can redistribute it and/or modify
d6ce1786
C
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation, according to version 2 of the License.
13 *
14 * Coccinelle is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with Coccinelle. If not, see <http://www.gnu.org/licenses/>.
21 *
22 * The authors reserve the right to distribute this or future versions of
23 * Coccinelle under other licenses.
24 *)
25
26
feec80c3 27# 0 "./lexer_cocci.mll"
34e49164
C
28{
29open Parser_cocci_menhir
30module D = Data
31module Ast = Ast_cocci
32module Ast0 = Ast0_cocci
33module P = Parse_aux
d6ce1786 34module FC = Flag_parsing_cocci
34e49164
C
35exception Lexical of string
36let tok = Lexing.lexeme
37
38let line = ref 1
39let logical_line = ref 0
40
41(* ---------------------------------------------------------------------- *)
42(* control codes *)
43
44(* Defined in data.ml
45type line_type = MINUS | OPTMINUS | UNIQUEMINUS | PLUS | CONTEXT | UNIQUE | OPT
46*)
47
48let current_line_type = ref (D.CONTEXT,!line,!logical_line)
49
50let prev_plus = ref false
51let line_start = ref 0 (* offset of the beginning of the line *)
52let get_current_line_type lexbuf =
53 let (c,l,ll) = !current_line_type in
54 let lex_start = Lexing.lexeme_start lexbuf in
55 let preceeding_spaces =
56 if !line_start < 0 then 0 else lex_start - !line_start in
708f4980 57 (*line_start := -1;*)
951c7801 58 prev_plus := (c = D.PLUS) or (c = D.PLUSPLUS);
8f657093 59 (c,l,ll,lex_start,preceeding_spaces,[],[],[])
34e49164
C
60let current_line_started = ref false
61let col_zero = ref true
62
97111a47
C
63let contextify (c,l,ll,lex_start,preceeding_spaces,bef,aft,pos) =
64 (D.CONTEXT,l,ll,lex_start,preceeding_spaces,bef,aft,pos)
65
34e49164
C
66let reset_line lexbuf =
67 line := !line + 1;
68 current_line_type := (D.CONTEXT,!line,!logical_line);
69 current_line_started := false;
70 col_zero := true;
71 line_start := Lexing.lexeme_start lexbuf + 1
72
73let started_line = ref (-1)
74
75let start_line seen_char =
76 current_line_started := true;
77 col_zero := false;
78 (if seen_char && not(!line = !started_line)
79 then
80 begin
81 started_line := !line;
82 logical_line := !logical_line + 1
83 end)
84
85let pass_zero _ = col_zero := false
86
87let lexerr s1 s2 = raise (Lexical (Printf.sprintf "%s%s" s1 s2))
88
d6ce1786
C
89let opt_reverse_token token =
90 if !FC.interpret_inverted
91 then match token with
92 D.MINUS -> D.PLUSPLUS (* maybe too liberal *)
93 | D.OPTMINUS -> lexerr "cannot invert token ?- (an optional minus line), which is needed for reversing the patch" ""
94 | D.UNIQUEMINUS -> D.PLUS
95 | D.PLUS -> D.MINUS
96 | D.PLUSPLUS -> D.MINUS (* may not be sufficient *)
97 | _ -> token
98 else token
99
34e49164 100let add_current_line_type x =
d6ce1786 101 match (opt_reverse_token x,!current_line_type) with
34e49164
C
102 (D.MINUS,(D.CONTEXT,ln,lln)) ->
103 current_line_type := (D.MINUS,ln,lln)
104 | (D.MINUS,(D.UNIQUE,ln,lln)) ->
105 current_line_type := (D.UNIQUEMINUS,ln,lln)
106 | (D.MINUS,(D.OPT,ln,lln)) ->
107 current_line_type := (D.OPTMINUS,ln,lln)
108 | (D.PLUS,(D.CONTEXT,ln,lln)) ->
109 current_line_type := (D.PLUS,ln,lln)
951c7801
C
110 | (D.PLUSPLUS,(D.CONTEXT,ln,lln)) ->
111 current_line_type := (D.PLUSPLUS,ln,lln)
34e49164
C
112 | (D.UNIQUE,(D.CONTEXT,ln,lln)) ->
113 current_line_type := (D.UNIQUE,ln,lln)
114 | (D.OPT,(D.CONTEXT,ln,lln)) ->
115 current_line_type := (D.OPT,ln,lln)
116 | _ -> lexerr "invalid control character combination" ""
117
118let check_minus_context_linetype s =
119 match !current_line_type with
951c7801 120 (D.PLUS,_,_) | (D.PLUSPLUS,_,_) -> lexerr "invalid in a + context: " s
34e49164
C
121 | _ -> ()
122
123let check_context_linetype s =
124 match !current_line_type with
125 (D.CONTEXT,_,_) -> ()
126 | _ -> lexerr "invalid in a nonempty context: " s
127
128let check_plus_linetype s =
129 match !current_line_type with
951c7801 130 (D.PLUS,_,_) | (D.PLUSPLUS,_,_) -> ()
34e49164
C
131 | _ -> lexerr "invalid in a non + context: " s
132
133let check_arity_context_linetype s =
134 match !current_line_type with
951c7801
C
135 (D.CONTEXT,_,_) | (D.PLUS,_,_) | (D.PLUSPLUS,_,_)
136 | (D.UNIQUE,_,_) | (D.OPT,_,_) -> ()
34e49164
C
137 | _ -> lexerr "invalid in a nonempty context: " s
138
aa721442
C
139let check_comment s =
140 if not !current_line_started
141 then lexerr "+ expected at the beginning of the line" s
142
34e49164
C
143let process_include start finish str =
144 (match !current_line_type with
951c7801 145 (D.PLUS,_,_) | (D.PLUSPLUS,_,_) ->
34e49164
C
146 (try
147 let _ = Str.search_forward (Str.regexp "\\.\\.\\.") str start in
148 lexerr "... not allowed in + include" ""
149 with Not_found -> ())
150 | _ -> ());
151 String.sub str (start + 1) (finish - start - 1)
152
153(* ---------------------------------------------------------------------- *)
154type pm = PATCH | MATCH | UNKNOWN
155
156let pm = ref UNKNOWN
157
158let patch_or_match = function
159 PATCH ->
7f004419
C
160 if not !D.ignore_patch_or_match
161 then
162 (match !pm with
163 MATCH ->
164 lexerr "- or + not allowed in the first column for a match" ""
165 | PATCH -> ()
166 | UNKNOWN -> Flag.sgrep_mode2 := false; pm := PATCH)
34e49164 167 | MATCH ->
7f004419
C
168 if not !D.ignore_patch_or_match
169 then
170 (match !pm with
171 PATCH -> lexerr "* not allowed in the first column for a patch" ""
172 | MATCH -> ()
173 | UNKNOWN -> Flag.sgrep_mode2 := true; pm := MATCH)
34e49164
C
174 | _ -> failwith "unexpected argument"
175
176(* ---------------------------------------------------------------------- *)
177(* identifiers, including metavariables *)
178
179let metavariables = (Hashtbl.create(100) : (string, D.clt -> token) Hashtbl.t)
180
181let all_metavariables =
182 (Hashtbl.create(100) : (string,(string * (D.clt -> token)) list) Hashtbl.t)
183
184let type_names = (Hashtbl.create(100) : (string, D.clt -> token) Hashtbl.t)
185
186let declarer_names = (Hashtbl.create(100) : (string, D.clt -> token) Hashtbl.t)
187
188let iterator_names = (Hashtbl.create(100) : (string, D.clt -> token) Hashtbl.t)
189
97111a47
C
190let symbol_names = (Hashtbl.create(15) : (string, D.clt -> token) Hashtbl.t)
191
34e49164
C
192let rule_names = (Hashtbl.create(100) : (string, unit) Hashtbl.t)
193
194let check_var s linetype =
195 let fail _ =
196 if (!Data.in_prolog || !Data.in_rule_name) &&
197 Str.string_match (Str.regexp "<.*>") s 0
198 then TPathIsoFile s
199 else
200 try (Hashtbl.find metavariables s) linetype
201 with Not_found ->
202 (try (Hashtbl.find type_names s) linetype
203 with Not_found ->
204 (try (Hashtbl.find declarer_names s) linetype
faf9a90c 205 with Not_found ->
34e49164 206 (try (Hashtbl.find iterator_names s) linetype
97111a47
C
207 with Not_found ->
208 (try (Hashtbl.find symbol_names s) linetype
209 with Not_found ->
210 TIdent (s,linetype))))) in
34e49164
C
211 if !Data.in_meta or !Data.in_rule_name
212 then (try Hashtbl.find rule_names s; TRuleName s with Not_found -> fail())
213 else fail()
214
215let id_tokens lexbuf =
216 let s = tok lexbuf in
217 let linetype = get_current_line_type lexbuf in
218 let in_rule_name = !Data.in_rule_name in
978fd7e5 219 let in_meta = !Data.in_meta && not !Data.saw_struct in
34e49164
C
220 let in_iso = !Data.in_iso in
221 let in_prolog = !Data.in_prolog in
feec80c3
C
222 (if s = "identifer" && in_meta
223 then Common.pr2 "Warning: should identifer be identifier?");
34e49164 224 match s with
b23ff9c7
C
225 "metavariable" when in_meta -> check_arity_context_linetype s; TMetavariable
226 | "identifier" when in_meta -> check_arity_context_linetype s; TIdentifier
34e49164
C
227 | "type" when in_meta -> check_arity_context_linetype s; TType
228 | "parameter" when in_meta -> check_arity_context_linetype s; TParameter
229 | "constant" when in_meta -> check_arity_context_linetype s; TConstant
faf9a90c
C
230 | "generated" when in_rule_name && not (!Flag.make_hrule = None) ->
231 check_arity_context_linetype s; TGenerated
34e49164
C
232 | "expression" when in_meta || in_rule_name ->
233 check_arity_context_linetype s; TExpression
413ffc02
C
234 | "declaration" when in_meta || in_rule_name ->
235 check_arity_context_linetype s; TDeclaration
236 | "field" when in_meta || in_rule_name ->
237 check_arity_context_linetype s; TField
113803cf
C
238 | "initialiser" when in_meta || in_rule_name ->
239 check_arity_context_linetype s; TInitialiser
240 | "initializer" when in_meta || in_rule_name ->
241 check_arity_context_linetype s; TInitialiser
34e49164
C
242 | "idexpression" when in_meta ->
243 check_arity_context_linetype s; TIdExpression
244 | "statement" when in_meta -> check_arity_context_linetype s; TStatement
245 | "function" when in_meta -> check_arity_context_linetype s; TFunction
246 | "local" when in_meta -> check_arity_context_linetype s; TLocal
247 | "list" when in_meta -> check_arity_context_linetype s; Tlist
248 | "fresh" when in_meta -> check_arity_context_linetype s; TFresh
249 | "typedef" when in_meta -> check_arity_context_linetype s; TTypedef
250 | "declarer" when in_meta -> check_arity_context_linetype s; TDeclarer
251 | "iterator" when in_meta -> check_arity_context_linetype s; TIterator
252 | "name" when in_meta -> check_arity_context_linetype s; TName
253 | "position" when in_meta -> check_arity_context_linetype s; TPosition
1b9ae606 254 | "analysis" when in_meta -> check_arity_context_linetype s; TAnalysis
34e49164
C
255 | "any" when in_meta -> check_arity_context_linetype s; TPosAny
256 | "pure" when in_meta && in_iso ->
257 check_arity_context_linetype s; TPure
258 | "context" when in_meta && in_iso ->
259 check_arity_context_linetype s; TContext
260 | "error" when in_meta -> check_arity_context_linetype s; TError
261 | "words" when in_meta -> check_context_linetype s; TWords
97111a47 262 | "symbol" when in_meta -> check_arity_context_linetype s; TSymbol
34e49164
C
263
264 | "using" when in_rule_name || in_prolog -> check_context_linetype s; TUsing
ae4735db
C
265 | "virtual" when in_prolog or in_rule_name or in_meta ->
266 (* don't want to allow virtual as a rule name *)
267 check_context_linetype s; TVirtual
34e49164
C
268 | "disable" when in_rule_name -> check_context_linetype s; TDisable
269 | "extends" when in_rule_name -> check_context_linetype s; TExtends
270 | "depends" when in_rule_name -> check_context_linetype s; TDepends
271 | "on" when in_rule_name -> check_context_linetype s; TOn
272 | "ever" when in_rule_name -> check_context_linetype s; TEver
273 | "never" when in_rule_name -> check_context_linetype s; TNever
978fd7e5 274 (* exists and forall for when are reparsed in parse_cocci.ml *)
34e49164
C
275 | "exists" when in_rule_name -> check_context_linetype s; TExists
276 | "forall" when in_rule_name -> check_context_linetype s; TForall
b1b2de81
C
277 | "script" when in_rule_name -> check_context_linetype s; TScript
278 | "initialize" when in_rule_name -> check_context_linetype s; TInitialize
279 | "finalize" when in_rule_name -> check_context_linetype s; TFinalize
34e49164
C
280
281 | "char" -> Tchar linetype
282 | "short" -> Tshort linetype
283 | "int" -> Tint linetype
284 | "double" -> Tdouble linetype
285 | "float" -> Tfloat linetype
286 | "long" -> Tlong linetype
287 | "void" -> Tvoid linetype
1eddfd50
C
288 | "size_t" -> Tsize_t linetype
289 | "ssize_t" -> Tssize_t linetype
290 | "ptrdiff_t" -> Tptrdiff_t linetype
978fd7e5
C
291 (* in_meta is only for the first keyword; drop it now to allow any type
292 name *)
293 | "struct" -> Data.saw_struct := true; Tstruct linetype
294 | "union" -> Data.saw_struct := true; Tunion linetype
295 | "enum" -> Data.saw_struct := true; Tenum linetype
34e49164
C
296 | "unsigned" -> Tunsigned linetype
297 | "signed" -> Tsigned linetype
faf9a90c 298
34e49164
C
299 | "auto" -> Tauto linetype
300 | "register" -> Tregister linetype
301 | "extern" -> Textern linetype
302 | "static" -> Tstatic linetype
303 | "inline" -> Tinline linetype
304 | "typedef" -> Ttypedef linetype
305
306 | "const" -> Tconst linetype
307 | "volatile" -> Tvolatile linetype
308
309 | "if" -> TIf linetype
310 | "else" -> TElse linetype
311 | "while" -> TWhile linetype
312 | "do" -> TDo linetype
313 | "for" -> TFor linetype
314 | "switch" -> TSwitch linetype
315 | "case" -> TCase linetype
316 | "default" -> TDefault linetype
317 | "return" -> TReturn linetype
318 | "break" -> TBreak linetype
319 | "continue" -> TContinue linetype
320 | "goto" -> TGoto linetype
321
322 | "sizeof" -> TSizeof linetype
323
aba5c457
C
324 | "Expression" when !Data.in_iso -> TIsoExpression
325 | "ArgExpression" when !Data.in_iso -> TIsoArgExpression
326 | "TestExpression" when !Data.in_iso -> TIsoTestExpression
327 | "ToTestExpression" when !Data.in_iso -> TIsoToTestExpression
328 | "Statement" when !Data.in_iso -> TIsoStatement
329 | "Declaration" when !Data.in_iso -> TIsoDeclaration
330 | "Type" when !Data.in_iso -> TIsoType
331 | "TopLevel" when !Data.in_iso -> TIsoTopLevel
332
333 | "_" when !Data.in_meta -> TUnderscore
34e49164
C
334
335 | s -> check_var s linetype
336
337let mkassign op lexbuf =
338 TAssign (Ast.OpAssign op, (get_current_line_type lexbuf))
339
340let init _ =
341 line := 1;
342 logical_line := 0;
343 prev_plus := false;
344 line_start := 0;
345 current_line_started := false;
c3e37e97 346 current_line_type := (D.CONTEXT,0,0);
34e49164
C
347 col_zero := true;
348 pm := UNKNOWN;
349 Data.in_rule_name := false;
350 Data.in_meta := false;
351 Data.in_prolog := false;
978fd7e5 352 Data.saw_struct := false;
34e49164
C
353 Data.inheritable_positions := [];
354 Hashtbl.clear all_metavariables;
355 Hashtbl.clear Data.all_metadecls;
356 Hashtbl.clear metavariables;
357 Hashtbl.clear type_names;
358 Hashtbl.clear rule_names;
708f4980
C
359 Hashtbl.clear iterator_names;
360 Hashtbl.clear declarer_names;
97111a47 361 Hashtbl.clear symbol_names;
34e49164 362 let get_name (_,x) = x in
b23ff9c7
C
363 Data.add_meta_meta :=
364 (fun name pure ->
365 let fn clt = TMeta(name,pure,clt) in
366 Hashtbl.replace metavariables (get_name name) fn);
34e49164
C
367 Data.add_id_meta :=
368 (fun name constraints pure ->
8babbc8f 369 let fn clt = TMetaId(name,constraints,Ast.NoVal,pure,clt) in
34e49164 370 Hashtbl.replace metavariables (get_name name) fn);
ae4735db
C
371 Data.add_virt_id_meta_found :=
372 (fun name vl ->
373 let fn clt = TIdent(vl,clt) in
374 Hashtbl.replace metavariables name fn);
375 Data.add_virt_id_meta_not_found :=
376 (fun name pure ->
8babbc8f 377 let fn clt = TMetaId(name,Ast.IdNoConstraint,Ast.NoVal,pure,clt) in
ae4735db 378 Hashtbl.replace metavariables (get_name name) fn);
b1b2de81 379 Data.add_fresh_id_meta :=
8babbc8f
C
380 (fun name seed ->
381 let fn clt = TMetaId(name,Ast.IdNoConstraint,seed,Ast0.Impure,clt) in
b1b2de81 382 Hashtbl.replace metavariables (get_name name) fn);
34e49164
C
383 Data.add_type_meta :=
384 (fun name pure ->
385 let fn clt = TMetaType(name,pure,clt) in
386 Hashtbl.replace metavariables (get_name name) fn);
113803cf
C
387 Data.add_init_meta :=
388 (fun name pure ->
389 let fn clt = TMetaInit(name,pure,clt) in
390 Hashtbl.replace metavariables (get_name name) fn);
8f657093
C
391 Data.add_initlist_meta :=
392 (function name -> function lenname -> function pure ->
393 let fn clt = TMetaInitList(name,lenname,pure,clt) in
394 Hashtbl.replace metavariables (get_name name) fn);
34e49164
C
395 Data.add_param_meta :=
396 (function name -> function pure ->
397 let fn clt = TMetaParam(name,pure,clt) in
398 Hashtbl.replace metavariables (get_name name) fn);
399 Data.add_paramlist_meta :=
400 (function name -> function lenname -> function pure ->
401 let fn clt = TMetaParamList(name,lenname,pure,clt) in
402 Hashtbl.replace metavariables (get_name name) fn);
403 Data.add_const_meta :=
404 (fun tyopt name constraints pure ->
405 let fn clt = TMetaConst(name,constraints,pure,tyopt,clt) in
406 Hashtbl.replace metavariables (get_name name) fn);
407 Data.add_err_meta :=
408 (fun name constraints pure ->
409 let fn clt = TMetaErr(name,constraints,pure,clt) in
410 Hashtbl.replace metavariables (get_name name) fn);
411 Data.add_exp_meta :=
412 (fun tyopt name constraints pure ->
413 let fn clt = TMetaExp(name,constraints,pure,tyopt,clt) in
414 Hashtbl.replace metavariables (get_name name) fn);
415 Data.add_idexp_meta :=
416 (fun tyopt name constraints pure ->
417 let fn clt = TMetaIdExp(name,constraints,pure,tyopt,clt) in
418 Hashtbl.replace metavariables (get_name name) fn);
419 Data.add_local_idexp_meta :=
420 (fun tyopt name constraints pure ->
421 let fn clt = TMetaLocalIdExp(name,constraints,pure,tyopt,clt) in
422 Hashtbl.replace metavariables (get_name name) fn);
423 Data.add_explist_meta :=
424 (function name -> function lenname -> function pure ->
425 let fn clt = TMetaExpList(name,lenname,pure,clt) in
426 Hashtbl.replace metavariables (get_name name) fn);
413ffc02
C
427 Data.add_decl_meta :=
428 (function name -> function pure ->
429 let fn clt = TMetaDecl(name,pure,clt) in
430 Hashtbl.replace metavariables (get_name name) fn);
431 Data.add_field_meta :=
432 (function name -> function pure ->
433 let fn clt = TMetaField(name,pure,clt) in
434 Hashtbl.replace metavariables (get_name name) fn);
190f1acf
C
435 Data.add_field_list_meta :=
436 (function name -> function lenname -> function pure ->
437 let fn clt = TMetaFieldList(name,lenname,pure,clt) in
438 Hashtbl.replace metavariables (get_name name) fn);
34e49164
C
439 Data.add_stm_meta :=
440 (function name -> function pure ->
441 let fn clt = TMetaStm(name,pure,clt) in
442 Hashtbl.replace metavariables (get_name name) fn);
443 Data.add_stmlist_meta :=
444 (function name -> function pure ->
445 let fn clt = TMetaStmList(name,pure,clt) in
446 Hashtbl.replace metavariables (get_name name) fn);
447 Data.add_func_meta :=
448 (fun name constraints pure ->
449 let fn clt = TMetaFunc(name,constraints,pure,clt) in
450 Hashtbl.replace metavariables (get_name name) fn);
451 Data.add_local_func_meta :=
452 (fun name constraints pure ->
453 let fn clt = TMetaLocalFunc(name,constraints,pure,clt) in
454 Hashtbl.replace metavariables (get_name name) fn);
455 Data.add_iterator_meta :=
456 (fun name constraints pure ->
457 let fn clt = TMetaIterator(name,constraints,pure,clt) in
458 Hashtbl.replace metavariables (get_name name) fn);
459 Data.add_declarer_meta :=
460 (fun name constraints pure ->
461 let fn clt = TMetaDeclarer(name,constraints,pure,clt) in
462 Hashtbl.replace metavariables (get_name name) fn);
463 Data.add_pos_meta :=
464 (fun name constraints any ->
465 let fn ((d,ln,_,_,_,_,_,_) as clt) =
466 (if d = Data.PLUS
467 then
468 failwith
469 (Printf.sprintf "%d: positions only allowed in minus code" ln));
470 TMetaPos(name,constraints,any,clt) in
471 Hashtbl.replace metavariables (get_name name) fn);
472 Data.add_type_name :=
473 (function name ->
474 let fn clt = TTypeId(name,clt) in
475 Hashtbl.replace type_names name fn);
476 Data.add_declarer_name :=
477 (function name ->
478 let fn clt = TDeclarerId(name,clt) in
479 Hashtbl.replace declarer_names name fn);
480 Data.add_iterator_name :=
481 (function name ->
482 let fn clt = TIteratorId(name,clt) in
483 Hashtbl.replace iterator_names name fn);
97111a47
C
484 Data.add_symbol_meta :=
485 (function name ->
486 let fn clt = TSymId (name,clt) in
487 Hashtbl.replace symbol_names name fn);
34e49164
C
488 Data.init_rule := (function _ -> Hashtbl.clear metavariables);
489 Data.install_bindings :=
490 (function parent ->
491 List.iter (function (name,fn) -> Hashtbl.add metavariables name fn)
492 (Hashtbl.find all_metavariables parent))
493
5636bb2c
C
494(* the following is needed to properly tokenize include files. Because an
495include file is included after seeing a @, so current_line_started is true.
496Current_line_started is not important for parsing the name of a rule, so we
497don't have to reset this value to true after parsing an included file. *)
498let include_init _ =
499 current_line_started := false
500
34e49164
C
501let drop_spaces s =
502 let len = String.length s in
503 let rec loop n =
504 if n = len
505 then n
506 else
507 if List.mem (String.get s n) [' ';'\t']
508 then loop (n+1)
509 else n in
510 let start = loop 0 in
511 String.sub s start (len - start)
512}
513
514(* ---------------------------------------------------------------------- *)
515(* tokens *)
516
517let letter = ['A'-'Z' 'a'-'z' '_']
518let digit = ['0'-'9']
519
520let dec = ['0'-'9']
521let oct = ['0'-'7']
522let hex = ['0'-'9' 'a'-'f' 'A'-'F']
523
524let decimal = ('0' | (['1'-'9'] dec*))
525let octal = ['0'] oct+
faf9a90c 526let hexa = ("0x" |"0X") hex+
34e49164
C
527
528let pent = dec+
529let pfract = dec+
530let sign = ['-' '+']
531let exp = ['e''E'] sign? dec+
532let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
533
534
535rule token = parse
c3e37e97
C
536 | [' ' '\t']* ['\n' '\r' '\011' '\012']
537 { let cls = !current_line_started in
97111a47 538
c3e37e97
C
539 if not cls
540 then
541 begin
542 match !current_line_type with
543 (D.PLUS,_,_) | (D.PLUSPLUS,_,_) ->
544 let info = get_current_line_type lexbuf in
545 reset_line lexbuf;
546 TPragma (Ast.Noindent "", info)
547 | _ -> reset_line lexbuf; token lexbuf
548 end
549 else (reset_line lexbuf; token lexbuf) }
550
551 | [' ' '\t' ]+ { start_line false; token lexbuf }
552
8f657093 553 | [' ' '\t' ]* (("//" [^ '\n']*) as after) {
c3e37e97
C
554 match !current_line_type with
555 (D.PLUS,_,_) | (D.PLUSPLUS,_,_) ->
8f657093
C
556 let str =
557 if !current_line_started
558 then (tok lexbuf)
559 else after in
190f1acf 560 start_line true;
8f657093 561 TPragma (Ast.Indent str, get_current_line_type lexbuf)
c3e37e97 562 | _ -> start_line false; token lexbuf }
34e49164 563
190f1acf
C
564 | "__attribute__" [' ' '\t']* "((" _* "))"
565 { match !current_line_type with
566 (D.PLUS,_,_) | (D.PLUSPLUS,_,_) ->
567 start_line true;
568 TPragma (Ast.Space (tok lexbuf), get_current_line_type lexbuf)
569 | _ -> failwith "attributes only allowedin + code" }
570
34e49164
C
571 | "@@" { start_line true; TArobArob }
572 | "@" { pass_zero();
573 if !Data.in_rule_name or not !current_line_started
574 then (start_line true; TArob)
17ba0788
C
575 else (check_minus_context_linetype "@";
576 TPArob (get_current_line_type lexbuf)) }
34e49164 577
f3c4ece6 578 | "=~" { start_line true; TTildeEq (get_current_line_type lexbuf) }
7fe62b65 579 | "!~" { start_line true; TTildeExclEq (get_current_line_type lexbuf) }
34e49164
C
580 | "WHEN" | "when"
581 { start_line true; check_minus_context_linetype (tok lexbuf);
582 TWhen (get_current_line_type lexbuf) }
583
584 | "..."
585 { start_line true; check_minus_context_linetype (tok lexbuf);
586 TEllipsis (get_current_line_type lexbuf) }
587(*
588 | "ooo"
589 { start_line true; check_minus_context_linetype (tok lexbuf);
590 TCircles (get_current_line_type lexbuf) }
591
592 | "***"
593 { start_line true; check_minus_context_linetype (tok lexbuf);
594 TStars (get_current_line_type lexbuf) }
595*)
596 | "<..." { start_line true; check_context_linetype (tok lexbuf);
597 TOEllipsis (get_current_line_type lexbuf) }
598 | "...>" { start_line true; check_context_linetype (tok lexbuf);
599 TCEllipsis (get_current_line_type lexbuf) }
5636bb2c 600 | "<+..." { start_line true; check_minus_context_linetype (tok lexbuf);
34e49164 601 TPOEllipsis (get_current_line_type lexbuf) }
5636bb2c 602 | "...+>" { start_line true; check_minus_context_linetype (tok lexbuf);
34e49164
C
603 TPCEllipsis (get_current_line_type lexbuf) }
604(*
605 | "<ooo" { start_line true; check_context_linetype (tok lexbuf);
606 TOCircles (get_current_line_type lexbuf) }
607 | "ooo>" { start_line true; check_context_linetype (tok lexbuf);
608 TCCircles (get_current_line_type lexbuf) }
609
610 | "<***" { start_line true; check_context_linetype (tok lexbuf);
611 TOStars (get_current_line_type lexbuf) }
612 | "***>" { start_line true; check_context_linetype (tok lexbuf);
613 TCStars (get_current_line_type lexbuf) }
614*)
615 | "-" { pass_zero();
616 if !current_line_started
617 then (start_line true; TMinus (get_current_line_type lexbuf))
618 else (patch_or_match PATCH;
619 add_current_line_type D.MINUS; token lexbuf) }
620 | "+" { pass_zero();
621 if !current_line_started
622 then (start_line true; TPlus (get_current_line_type lexbuf))
623 else if !Data.in_meta
624 then TPlus0
625 else (patch_or_match PATCH;
626 add_current_line_type D.PLUS; token lexbuf) }
627 | "?" { pass_zero();
628 if !current_line_started
629 then (start_line true; TWhy (get_current_line_type lexbuf))
630 else if !Data.in_meta
631 then TWhy0
632 else (add_current_line_type D.OPT; token lexbuf) }
633 | "!" { pass_zero();
634 if !current_line_started
635 then (start_line true; TBang (get_current_line_type lexbuf))
636 else if !Data.in_meta
637 then TBang0
638 else (add_current_line_type D.UNIQUE; token lexbuf) }
aba5c457 639 | "(" { if !Data.in_meta or not !col_zero
34e49164
C
640 then (start_line true; TOPar (get_current_line_type lexbuf))
641 else
642 (start_line true; check_context_linetype (tok lexbuf);
643 TOPar0 (get_current_line_type lexbuf))}
97111a47
C
644 | "\\(" { start_line true;
645 TOPar0 (contextify(get_current_line_type lexbuf)) }
34e49164
C
646 | "|" { if not (!col_zero)
647 then (start_line true; TOr(get_current_line_type lexbuf))
648 else (start_line true;
649 check_context_linetype (tok lexbuf);
650 TMid0 (get_current_line_type lexbuf))}
97111a47
C
651 | "\\|" { start_line true;
652 TMid0 (contextify(get_current_line_type lexbuf)) }
34e49164
C
653 | ")" { if not !col_zero
654 then (start_line true; TCPar (get_current_line_type lexbuf))
655 else
656 (start_line true; check_context_linetype (tok lexbuf);
657 TCPar0 (get_current_line_type lexbuf))}
97111a47
C
658 | "\\)" { start_line true;
659 TCPar0 (contextify(get_current_line_type lexbuf)) }
34e49164
C
660
661 | '[' { start_line true; TOCro (get_current_line_type lexbuf) }
662 | ']' { start_line true; TCCro (get_current_line_type lexbuf) }
663 | '{' { start_line true; TOBrace (get_current_line_type lexbuf) }
664 | '}' { start_line true; TCBrace (get_current_line_type lexbuf) }
665
666 | "->" { start_line true; TPtrOp (get_current_line_type lexbuf) }
667 | '.' { start_line true; TDot (get_current_line_type lexbuf) }
668 | ',' { start_line true; TComma (get_current_line_type lexbuf) }
669 | ";" { start_line true;
670 if !Data.in_meta
671 then TMPtVirg (* works better with tokens_all *)
672 else TPtVirg (get_current_line_type lexbuf) }
673
faf9a90c 674
34e49164
C
675 | '*' { pass_zero();
676 if !current_line_started
677 then
678 (start_line true; TMul (get_current_line_type lexbuf))
679 else
680 (patch_or_match MATCH;
681 add_current_line_type D.MINUS; token lexbuf) }
682 | '/' { start_line true;
faf9a90c 683 TDmOp (Ast.Div,get_current_line_type lexbuf) }
1b9ae606
C
684 | "<?" { start_line true;
685 TDmOp (Ast.Min,get_current_line_type lexbuf) }
686 | ">?" { start_line true;
687 TDmOp (Ast.Max,get_current_line_type lexbuf) }
34e49164 688 | '%' { start_line true;
faf9a90c
C
689 TDmOp (Ast.Mod,get_current_line_type lexbuf) }
690 | '~' { start_line true; TTilde (get_current_line_type lexbuf) }
691
951c7801
C
692 | "++" { pass_zero();
693 if !current_line_started
694 then
695 (start_line true; TInc (get_current_line_type lexbuf))
696 else (patch_or_match PATCH;
697 add_current_line_type D.PLUSPLUS; token lexbuf) }
34e49164 698 | "--" { start_line true; TDec (get_current_line_type lexbuf) }
faf9a90c
C
699
700 | "=" { start_line true; TEq (get_current_line_type lexbuf) }
701
34e49164
C
702 | "-=" { start_line true; mkassign Ast.Minus lexbuf }
703 | "+=" { start_line true; mkassign Ast.Plus lexbuf }
faf9a90c 704
34e49164
C
705 | "*=" { start_line true; mkassign Ast.Mul lexbuf }
706 | "/=" { start_line true; mkassign Ast.Div lexbuf }
707 | "%=" { start_line true; mkassign Ast.Mod lexbuf }
faf9a90c 708
34e49164
C
709 | "&=" { start_line true; mkassign Ast.And lexbuf }
710 | "|=" { start_line true; mkassign Ast.Or lexbuf }
711 | "^=" { start_line true; mkassign Ast.Xor lexbuf }
1b9ae606
C
712 | ">?=" { start_line true; mkassign Ast.Max lexbuf }
713 | "<?=" { start_line true; mkassign Ast.Min lexbuf }
faf9a90c 714
34e49164
C
715 | "<<=" { start_line true; mkassign Ast.DecLeft lexbuf }
716 | ">>=" { start_line true; mkassign Ast.DecRight lexbuf }
717
718 | ":" { start_line true; TDotDot (get_current_line_type lexbuf) }
faf9a90c 719
951c7801
C
720 | "==" { start_line true; TEqEq (get_current_line_type lexbuf) }
721 | "!=" { start_line true; TNotEq (get_current_line_type lexbuf) }
34e49164
C
722 | ">=" { start_line true;
723 TLogOp(Ast.SupEq,get_current_line_type lexbuf) }
724 | "<=" { start_line true;
5636bb2c
C
725 if !Data.in_meta
726 then TSub(get_current_line_type lexbuf)
727 else TLogOp(Ast.InfEq,get_current_line_type lexbuf) }
34e49164 728 | "<" { start_line true;
faf9a90c 729 TLogOp(Ast.Inf,get_current_line_type lexbuf) }
34e49164
C
730 | ">" { start_line true;
731 TLogOp(Ast.Sup,get_current_line_type lexbuf) }
faf9a90c
C
732
733 | "&&" { start_line true; TAndLog (get_current_line_type lexbuf) }
34e49164 734 | "||" { start_line true; TOrLog (get_current_line_type lexbuf) }
faf9a90c 735
34e49164 736 | ">>" { start_line true;
413ffc02 737 TShROp(Ast.DecRight,get_current_line_type lexbuf) }
34e49164 738 | "<<" { start_line true;
413ffc02 739 TShLOp(Ast.DecLeft,get_current_line_type lexbuf) }
faf9a90c 740
34e49164
C
741 | "&" { start_line true; TAnd (get_current_line_type lexbuf) }
742 | "^" { start_line true; TXor(get_current_line_type lexbuf) }
743
978fd7e5 744 | "##" { start_line true; TCppConcatOp }
3a314143
C
745 | (( ("#" [' ' '\t']* "undef" [' ' '\t']+)) as def)
746 ( (letter (letter |digit)*) as ident)
747 { start_line true;
748 let (arity,line,lline,offset,col,strbef,straft,pos) as lt =
749 get_current_line_type lexbuf in
750 let off = String.length def in
751 (* -1 in the code below because the ident is not at the line start *)
752 TUndef
753 (lt,
754 check_var ident
8f657093 755 (arity,line,lline,offset+off,col+off,[],[],[])) }
708f4980 756 | (( ("#" [' ' '\t']* "define" [' ' '\t']+)) as def)
faf9a90c 757 ( (letter (letter |digit)*) as ident)
34e49164
C
758 { start_line true;
759 let (arity,line,lline,offset,col,strbef,straft,pos) as lt =
760 get_current_line_type lexbuf in
708f4980 761 let off = String.length def in
34e49164
C
762 (* -1 in the code below because the ident is not at the line start *)
763 TDefine
764 (lt,
765 check_var ident
8f657093 766 (arity,line,lline,offset+off,col+off,[],[],[])) }
708f4980 767 | (( ("#" [' ' '\t']* "define" [' ' '\t']+)) as def)
faf9a90c 768 ( (letter (letter | digit)*) as ident)
34e49164
C
769 '('
770 { start_line true;
771 let (arity,line,lline,offset,col,strbef,straft,pos) as lt =
772 get_current_line_type lexbuf in
708f4980 773 let off = String.length def in
34e49164
C
774 TDefineParam
775 (lt,
776 check_var ident
777 (* why pos here but not above? *)
708f4980
C
778 (arity,line,lline,offset+off,col+off,strbef,straft,pos),
779 offset + off + (String.length ident),
780 col + off + (String.length ident)) }
abad11c5 781 | "#" [' ' '\t']* "include" [' ' '\t']* '\"' [^ '\"']+ '\"'
34e49164
C
782 { TIncludeL
783 (let str = tok lexbuf in
abad11c5
C
784 let start = String.index str '\"' in
785 let finish = String.rindex str '\"' in
34e49164
C
786 start_line true;
787 (process_include start finish str,get_current_line_type lexbuf)) }
788 | "#" [' ' '\t']* "include" [' ' '\t']* '<' [^ '>']+ '>'
789 { TIncludeNL
790 (let str = tok lexbuf in
791 let start = String.index str '<' in
792 let finish = String.rindex str '>' in
793 start_line true;
794 (process_include start finish str,get_current_line_type lexbuf)) }
795 | "#" [' ' '\t']* "if" [^'\n']*
796 | "#" [' ' '\t']* "ifdef" [^'\n']*
797 | "#" [' ' '\t']* "ifndef" [^'\n']*
798 | "#" [' ' '\t']* "else" [^'\n']*
799 | "#" [' ' '\t']* "elif" [^'\n']*
800 | "#" [' ' '\t']* "endif" [^'\n']*
801 | "#" [' ' '\t']* "error" [^'\n']*
97111a47
C
802 | "#" [' ' '\t']* "pragma" [^'\n']*
803 | "#" [' ' '\t']* "line" [^'\n']*
34e49164 804 { start_line true; check_plus_linetype (tok lexbuf);
c3e37e97 805 TPragma (Ast.Noindent(tok lexbuf), get_current_line_type lexbuf) }
0708f913 806 | "/*"
8babbc8f
C
807 {
808 match !current_line_type with
809 (D.PLUS,_,_) | (D.PLUSPLUS,_,_) ->
810 start_line true;
0708f913 811 (* second argument to TPragma is not quite right, because
aa721442 812 it represents only the first token of the comment, but that
0708f913 813 should be good enough *)
8babbc8f
C
814 TPragma (Ast.Indent("/*"^(comment check_comment lexbuf)),
815 get_current_line_type lexbuf)
816 | _ -> let _ = comment (fun _ -> ()) lexbuf in token lexbuf }
34e49164
C
817 | "---" [^'\n']*
818 { (if !current_line_started
819 then lexerr "--- must be at the beginning of the line" "");
820 start_line true;
821 TMinusFile
822 (let str = tok lexbuf in
823 (drop_spaces(String.sub str 3 (String.length str - 3)),
824 (get_current_line_type lexbuf))) }
825 | "+++" [^'\n']*
826 { (if !current_line_started
827 then lexerr "+++ must be at the beginning of the line" "");
828 start_line true;
829 TPlusFile
830 (let str = tok lexbuf in
831 (drop_spaces(String.sub str 3 (String.length str - 3)),
832 (get_current_line_type lexbuf))) }
833
834 | letter (letter | digit)*
faf9a90c 835 { start_line true; id_tokens lexbuf }
34e49164 836
abad11c5
C
837 (* christia: testing *)
838 | (letter | '$') (letter | digit | '$') *
839 { start_line true; id_tokens lexbuf }
840
841 | (letter | '$') (letter | digit | '$') *
842 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?
843 ("::~" (letter | '$') (letter | digit | '$') *
844 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?) +
845
846 {
847 start_line true;
848 if not !Flag.c_plus_plus
849 then Common.pr2_once "< and > not allowed in C identifiers, try -c++ option";
850 id_tokens lexbuf
851 }
852 | ((letter | '$') (letter | digit | '$') * )
853 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>')
854
855 {
856 start_line true;
857 if not !Flag.c_plus_plus
858 then Common.pr2_once "< and > not allowed in C identifiers, try -c++ option";
859 id_tokens lexbuf
860 }
861
862 | (((letter | '$') (letter | digit | '$') * ))
863 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?
864 "::" (((letter | '$') (letter | digit | '$') * ))
865 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?
866 ("::" ((letter | '$') (letter | digit | '$') * )
867 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?) *
868
869 {
870 start_line true;
871 if not !Flag.c_plus_plus
872 then Common.pr2_once "~ and :: not allowed in C identifiers, try -c++ option";
873 id_tokens lexbuf
874 }
875
876 | "::" ((letter | '$') (letter | digit | '$') * )
877 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?
878 ("::" ((letter | '$') (letter | digit | '$') * )
879 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?) *
880 {
881 start_line true;
882 if not !Flag.c_plus_plus
883 then Common.pr2_once "~ and :: not allowed in C identifiers, try -c++ option";
884 id_tokens lexbuf
885 }
886 (* christia: end *)
887
888
34e49164
C
889 | "'" { start_line true;
890 TChar(char lexbuf,get_current_line_type lexbuf) }
abad11c5 891 | '\"' { start_line true;
34e49164
C
892 TString(string lexbuf,(get_current_line_type lexbuf)) }
893 | (real as x) { start_line true;
894 TFloat(x,(get_current_line_type lexbuf)) }
faf9a90c
C
895 | ((( decimal | hexa | octal)
896 ( ['u' 'U']
897 | ['l' 'L']
34e49164
C
898 | (['l' 'L'] ['u' 'U'])
899 | (['u' 'U'] ['l' 'L'])
900 | (['u' 'U'] ['l' 'L'] ['l' 'L'])
901 | (['l' 'L'] ['l' 'L'])
902 )?
903 ) as x) { start_line true; TInt(x,(get_current_line_type lexbuf)) }
904
905 | "<=>" { TIso }
906 | "=>" { TRightIso }
907
908 | eof { EOF }
909
910 | _ { lexerr "unrecognised symbol, in token rule: " (tok lexbuf) }
911
912
913and char = parse
f3c4ece6
C
914 | (_ as x) { String.make 1 x ^ restchars lexbuf }
915 (* todo?: as for octal, do exception beyond radix exception ? *)
916 | (("\\" (oct | oct oct | oct oct oct)) as x ) { x ^ restchars lexbuf }
917 (* this rule must be after the one with octal, lex try first longest
918 * and when \7 we want an octal, not an exn.
919 *)
920 | (("\\x" ((hex | hex hex))) as x ) { x ^ restchars lexbuf }
921 | (("\\" (_ as v)) as x )
922 {
923 (match v with (* Machine specific ? *)
924 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
925 | 'f' -> () | 'a' -> ()
abad11c5 926 | '\\' -> () | '?' -> () | '\'' -> () | '\"' -> ()
f3c4ece6
C
927 | 'e' -> () (* linuxext: ? *)
928 | _ ->
929 Common.pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
930 );
931 x ^ restchars lexbuf
932 }
933 | _
934 { Common.pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
935 tok lexbuf ^ restchars lexbuf
936 }
937
938and restchars = parse
939 | "'" { "" }
940 | (_ as x) { String.make 1 x ^ restchars lexbuf }
941 (* todo?: as for octal, do exception beyond radix exception ? *)
942 | (("\\" (oct | oct oct | oct oct oct)) as x ) { x ^ restchars lexbuf }
943 (* this rule must be after the one with octal, lex try first longest
944 * and when \7 we want an octal, not an exn.
945 *)
946 | (("\\x" ((hex | hex hex))) as x ) { x ^ restchars lexbuf }
947 | (("\\" (_ as v)) as x )
948 {
949 (match v with (* Machine specific ? *)
950 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
951 | 'f' -> () | 'a' -> ()
abad11c5 952 | '\\' -> () | '?' -> () | '\'' -> () | '\"' -> ()
f3c4ece6
C
953 | 'e' -> () (* linuxext: ? *)
954 | _ ->
955 Common.pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
956 );
957 x ^ restchars lexbuf
faf9a90c 958 }
f3c4ece6
C
959 | _
960 { Common.pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
961 tok lexbuf ^ restchars lexbuf
962 }
34e49164
C
963
964and string = parse
abad11c5 965 | '\"' { "" }
34e49164
C
966 | (_ as x) { Common.string_of_char x ^ string lexbuf }
967 | ("\\" (oct | oct oct | oct oct oct)) as x { x ^ string lexbuf }
968 | ("\\x" (hex | hex hex)) as x { x ^ string lexbuf }
faf9a90c
C
969 | ("\\" (_ as v)) as x
970 {
34e49164 971 (match v with
9f8e26f4
C
972 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
973 | 'f' -> () | 'a' -> ()
abad11c5 974 | '\\' -> () | '?' -> () | '\'' -> () | '\"' -> ()
9f8e26f4
C
975 | 'e' -> ()
976 | '\n' -> ()
977 | '(' -> () | '|' -> () | ')' -> ()
978 | _ -> lexerr "unrecognised symbol:" (tok lexbuf)
34e49164
C
979 );
980 x ^ string lexbuf
981 }
982 | _ { lexerr "unrecognised symbol: " (tok lexbuf) }
0708f913 983
8babbc8f 984and comment check_comment = parse
aa721442 985 | "*/" { let s = tok lexbuf in check_comment s; start_line true; s }
0708f913 986 | ['\n' '\r' '\011' '\012']
aa721442
C
987 { let s = tok lexbuf in
988 (* even blank line should have a + *)
989 check_comment s;
8babbc8f 990 reset_line lexbuf; s ^ comment check_comment lexbuf }
0708f913
C
991 | "+" { pass_zero();
992 if !current_line_started
8babbc8f
C
993 then (start_line true;
994 let s = tok lexbuf in s^(comment check_comment lexbuf))
995 else (start_line true; comment check_comment lexbuf) }
0708f913 996 (* noteopti: *)
aa721442
C
997 | [^ '*']
998 { let s = tok lexbuf in
8babbc8f 999 check_comment s; start_line true; s ^ comment check_comment lexbuf }
aa721442
C
1000 | [ '*']
1001 { let s = tok lexbuf in
8babbc8f 1002 check_comment s; start_line true; s ^ comment check_comment lexbuf }
951c7801 1003 | _
0708f913
C
1004 { start_line true; let s = tok lexbuf in
1005 Common.pr2 ("LEXER: unrecognised symbol in comment:"^s);
8babbc8f 1006 s ^ comment check_comment lexbuf
0708f913
C
1007 }
1008