Switch to recommended form of GPLv3 permissions notice.
[bpt/emacs.git] / lisp / emacs-lisp / rx.el
CommitLineData
12c64503
GM
1;;; rx.el --- sexp notation for regular expressions
2
ceb4c4d3 3;; Copyright (C) 2001, 2002, 2003, 2004, 2005,
8b72699e 4;; 2006, 2007, 2008 Free Software Foundation, Inc.
12c64503
GM
5
6;; Author: Gerd Moellmann <gerd@gnu.org>
7;; Maintainer: FSF
8;; Keywords: strings, regexps, extensions
9
10;; This file is part of GNU Emacs.
11
d6cba7ae 12;; GNU Emacs is free software: you can redistribute it and/or modify
12c64503 13;; it under the terms of the GNU General Public License as published by
d6cba7ae
GM
14;; the Free Software Foundation, either version 3 of the License, or
15;; (at your option) any later version.
12c64503
GM
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
d6cba7ae 23;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
12c64503
GM
24
25;;; Commentary:
26
27;; This is another implementation of sexp-form regular expressions.
28;; It was unfortunately written without being aware of the Sregex
29;; package coming with Emacs, but as things stand, Rx completely
30;; covers all regexp features, which Sregex doesn't, doesn't suffer
31;; from the bugs mentioned in the commentary section of Sregex, and
32;; uses a nicer syntax (IMHO, of course :-).
33
ccfbe679
SM
34;; This significantly extended version of the original is almost
35;; compatible with Sregex. The only incompatibility I (fx) know of is
36;; that the `repeat' form can't have multiple regexp args.
37
38;; Now alternative forms are provided for a degree of compatibility
39;; with Shivers' attempted definitive SRE notation
40;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>. SRE forms not
41;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
42;; ,<exp>, (word ...), word+, posix-string, and character class forms.
43;; Some forms are inconsistent with SRE, either for historical reasons
44;; or because of the implementation -- simple translation into Emacs
45;; regexp strings. These include: any, word. Also, case-sensitivity
46;; and greediness are controlled by variables external to the regexp,
47;; and you need to feed the forms to the `posix-' functions to get
48;; SRE's POSIX semantics. There are probably more difficulties.
49
12c64503
GM
50;; Rx translates a sexp notation for regular expressions into the
51;; usual string notation. The translation can be done at compile-time
52;; by using the `rx' macro. It can be done at run-time by calling
53;; function `rx-to-string'. See the documentation of `rx' for a
54;; complete description of the sexp notation.
55;;
56;; Some examples of string regexps and their sexp counterparts:
57;;
58;; "^[a-z]*"
59;; (rx (and line-start (0+ (in "a-z"))))
60;;
61;; "\n[^ \t]"
62;; (rx (and "\n" (not blank))), or
63;; (rx (and "\n" (not (any " \t"))))
64;;
65;; "\\*\\*\\* EOOH \\*\\*\\*\n"
66;; (rx "*** EOOH ***\n")
67;;
68;; "\\<\\(catch\\|finally\\)\\>[^_]"
69;; (rx (and word-start (submatch (or "catch" "finally")) word-end
70;; (not (any ?_))))
71;;
72;; "[ \t\n]*:\\([^:]+\\|$\\)"
73;; (rx (and (zero-or-more (in " \t\n")) ":"
74;; (submatch (or line-end (one-or-more (not (any ?:)))))))
75;;
76;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
77;; (rx (and line-start
78;; "content-transfer-encoding:"
c53f9b3b 79;; (* (? ?\n) blank)
12c64503 80;; "quoted-printable"
c53f9b3b 81;; (* (? ?\n) blank)))
12c64503
GM
82;;
83;; (concat "^\\(?:" something-else "\\)")
84;; (rx (and line-start (eval something-else))), statically or
85;; (rx-to-string `(and line-start ,something-else)), dynamically.
86;;
87;; (regexp-opt '(STRING1 STRING2 ...))
88;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
89;; calls `regexp-opt' as needed.
90;;
91;; "^;;\\s-*\n\\|^\n"
92;; (rx (or (and line-start ";;" (0+ space) ?\n)
93;; (and line-start ?\n)))
94;;
95;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
a1506d29
JB
96;; (rx (and "$Id: "
97;; (1+ (not (in " ")))
12c64503
GM
98;; " "
99;; (submatch (1+ (not (in " "))))
c53f9b3b 100;; " "))
12c64503
GM
101;;
102;; "\\\\\\\\\\[\\w+"
103;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
104;;
105;; etc.
106
107;;; History:
a1506d29 108;;
12c64503
GM
109
110;;; Code:
111
12c64503
GM
(defconst rx-constituents
  '(;; Sequencing and alternation.
    (and               . (rx-and 1 nil))
    (seq               . and)           ; SRE
    (:                 . and)           ; SRE
    (sequence          . and)           ; sregex
    (or                . (rx-or 1 nil))
    (|                 . or)            ; SRE

    ;; Single characters and character sets.
    (not-newline       . ".")
    (nonl              . not-newline)   ; SRE
    (anything          . "\\(?:.\\|\n\\)")
    (any               . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
    (in                . any)
    (char              . any)           ; sregex
    (not-char          . (rx-not-char 1 nil rx-check-any)) ; sregex
    (not               . (rx-not 1 1 rx-check-not))

    ;; Counted repetition.
    ;; Partially consistent with sregex, whose `repeat' is like our
    ;; `**'.  (`repeat' with optional max arg and multiple sexp forms
    ;; is ambiguous.)
    (repeat            . (rx-repeat 2 3))
    (=                 . (rx-= 2 nil))  ; SRE
    (>=                . (rx->= 2 nil)) ; SRE
    (**                . (rx-** 2 nil)) ; SRE

    ;; Groups.
    (submatch          . (rx-submatch 1 nil)) ; SRE
    (group             . submatch)

    ;; Kleene-style postfix operators and their aliases.
    (zero-or-more      . (rx-kleene 1 nil))
    (one-or-more       . (rx-kleene 1 nil))
    (zero-or-one       . (rx-kleene 1 nil))
    (\?                . zero-or-one)   ; SRE
    (\??               . zero-or-one)
    (*                 . zero-or-more)  ; SRE
    (*?                . zero-or-more)
    (0+                . zero-or-more)
    (+                 . one-or-more)   ; SRE
    (+?                . one-or-more)
    (1+                . one-or-more)
    (optional          . zero-or-one)
    (opt               . zero-or-one)   ; sregex
    (minimal-match     . (rx-greedy 1 1))
    (maximal-match     . (rx-greedy 1 1))
    (backref           . (rx-backref 1 1 rx-check-backref))

    ;; Zero-width assertions.
    (line-start        . "^")
    (bol               . line-start)    ; SRE
    (line-end          . "$")
    (eol               . line-end)      ; SRE
    (string-start      . "\\`")
    (bos               . string-start)  ; SRE
    (bot               . string-start)  ; sregex
    (string-end        . "\\'")
    (eos               . string-end)    ; SRE
    (eot               . string-end)    ; sregex
    (buffer-start      . "\\`")
    (buffer-end        . "\\'")
    (point             . "\\=")
    (word-start        . "\\<")
    (bow               . word-start)    ; SRE
    (word-end          . "\\>")
    (eow               . word-end)      ; SRE
    (word-boundary     . "\\b")
    (not-word-boundary . "\\B")         ; sregex
    (symbol-start      . "\\_<")
    (symbol-end        . "\\_>")

    ;; Syntax classes, categories and escapes to other notations.
    (syntax            . (rx-syntax 1 1))
    (not-syntax        . (rx-not-syntax 1 1)) ; sregex
    (category          . (rx-category 1 1 rx-check-category))
    (eval              . (rx-eval 1 1))
    (regexp            . (rx-regexp 1 1 stringp))

    ;; Named character classes.
    (digit             . "[[:digit:]]")
    (numeric           . digit)         ; SRE
    (num               . digit)         ; SRE
    (control           . "[[:cntrl:]]") ; SRE
    (cntrl             . control)       ; SRE
    (hex-digit         . "[[:xdigit:]]") ; SRE
    (hex               . hex-digit)     ; SRE
    (xdigit            . hex-digit)     ; SRE
    (blank             . "[[:blank:]]") ; SRE
    (graphic           . "[[:graph:]]") ; SRE
    (graph             . graphic)       ; SRE
    (printing          . "[[:print:]]") ; SRE
    (print             . printing)      ; SRE
    (alphanumeric      . "[[:alnum:]]") ; SRE
    (alnum             . alphanumeric)  ; SRE
    (letter            . "[[:alpha:]]")
    (alphabetic        . letter)        ; SRE
    (alpha             . letter)        ; SRE
    (ascii             . "[[:ascii:]]") ; SRE
    (nonascii          . "[[:nonascii:]]")
    (lower             . "[[:lower:]]") ; SRE
    (lower-case        . lower)         ; SRE
    (punctuation       . "[[:punct:]]") ; SRE
    (punct             . punctuation)   ; SRE
    (space             . "[[:space:]]") ; SRE
    (whitespace        . space)         ; SRE
    (white             . space)         ; SRE
    (upper             . "[[:upper:]]") ; SRE
    (upper-case        . upper)         ; SRE
    (word              . "[[:word:]]")  ; inconsistent with SRE
    (wordchar          . word)          ; sregex
    (not-wordchar      . "[^[:word:]]") ; sregex (use \\W?)
    )
  "Alist of sexp form regexp constituents.
Each element of the alist has the form (SYMBOL . DEFN).
SYMBOL is a valid constituent of sexp regular expressions.
If DEFN is a string, SYMBOL is translated into DEFN.
If DEFN is a symbol, use the definition of DEFN, recursively.
Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
are the minimum and maximum number of arguments the function-form
sexp constituent SYMBOL may have in sexp regular expressions.
MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
all arguments must satisfy PREDICATE.")
222
223
(defconst rx-syntax
  '((whitespace        . ?-)
    (punctuation       . ?.)
    (word              . ?w)
    (symbol            . ?_)
    (open-parenthesis  . ?\()
    (close-parenthesis . ?\))
    (expression-prefix . ?\')
    (string-quote      . ?\")
    (paired-delimiter  . ?$)
    (escape            . ?\\)
    (character-quote   . ?/)
    (comment-start     . ?<)
    (comment-end       . ?>)
    (string-delimiter  . ?|)
    (comment-delimiter . ?!))
  "Alist mapping Rx syntax symbols to syntax characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(syntax SYMBOL)', and CHAR is the syntax character
corresponding to SYMBOL, as it would be used with \\s or \\S in
regular expressions.")
245
246
(defconst rx-categories
  '((consonant                        . ?0)
    (base-vowel                       . ?1)
    (upper-diacritical-mark           . ?2)
    (lower-diacritical-mark           . ?3)
    (tone-mark                        . ?4)
    (symbol                           . ?5)
    (digit                            . ?6)
    (vowel-modifying-diacritical-mark . ?7)
    (vowel-sign                       . ?8)
    (semivowel-lower                  . ?9)
    (not-at-end-of-line               . ?<)
    (not-at-beginning-of-line         . ?>)
    (alpha-numeric-two-byte           . ?A)
    (chinese-two-byte                 . ?C)
    ;; Historical misspelling, kept so existing callers keep working.
    (chinse-two-byte                  . ?C)
    (greek-two-byte                   . ?G)
    (japanese-hiragana-two-byte       . ?H)
    (indian-two-byte                  . ?I)
    (japanese-katakana-two-byte       . ?K)
    (korean-hangul-two-byte           . ?N)
    (cyrillic-two-byte                . ?Y)
    (combining-diacritic              . ?^)
    (ascii                            . ?a)
    (arabic                           . ?b)
    (chinese                          . ?c)
    (ethiopic                         . ?e)
    (greek                            . ?g)
    (korean                           . ?h)
    (indian                           . ?i)
    (japanese                         . ?j)
    (japanese-katakana                . ?k)
    (latin                            . ?l)
    (lao                              . ?o)
    (tibetan                          . ?q)
    (japanese-roman                   . ?r)
    (thai                             . ?t)
    (vietnamese                       . ?v)
    (hebrew                           . ?w)
    (cyrillic                         . ?y)
    (can-break                        . ?|))
  "Alist mapping symbols to category characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(category SYMBOL)', and CHAR is the category character
corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
regular expression strings.

`chinse-two-byte' is a misspelled synonym of `chinese-two-byte',
retained for backward compatibility.")
292
293
(defvar rx-greedy-flag t
  "Non-nil means `zero-or-one', `zero-or-more' and `one-or-more' are greedy.
When nil, those forms produce non-greedy regular expressions instead.
This variable is dynamically bound.")
297
298
(defun rx-info (op)
  "Return parsing/code generation info for OP.
If OP is the space character ASCII 32, return info for the symbol `?'.
If OP is the character `?', return info for the symbol `??'.
Symbol definitions in `rx-constituents' are followed until a
non-symbol definition (or nil, for an unknown OP) is reached.
See also `rx-constituents'."
  ;; The Lisp reader turns `(? X)' into (32 X) and `(?? X)' into (63 X),
  ;; so map those characters back to the operators they stand for.
  (let ((entry (cond ((eq op ?\s) '\?)
                     ((eq op ?\?) '\??)
                     (t op))))
    ;; Chase alias chains such as `bol' -> `line-start' -> "^".
    (while (and entry (symbolp entry))
      (setq entry (cdr (assq entry rx-constituents))))
    entry))
a1506d29 309
12c64503
GM
310
(defun rx-check (form)
  "Check FORM according to its car's parsing info.
Signal an error if FORM is not a list, if it has fewer or more
arguments than its `rx-constituents' entry allows, or if any
argument fails the entry's predicate."
  (or (listp form)
      (error "rx `%s' needs argument(s)" form))
  (let* ((info (rx-info (car form)))
         (arg-count (1- (length form)))
         (lower (nth 1 info))
         (upper (nth 2 info))
         (predicate (nth 3 info)))
    (if (and lower (< arg-count lower))
        (error "rx form `%s' requires at least %d args"
               (car form) lower))
    ;; A nil upper bound means any number of arguments is accepted.
    (if (and upper (> arg-count upper))
        (error "rx form `%s' accepts at most %d args"
               (car form) upper))
    (if predicate
        (dolist (argument (cdr form))
          (or (funcall predicate argument)
              (error "rx form `%s' requires args satisfying `%s'"
                     (car form) predicate))))))
333
334
(defun rx-and (form)
  "Parse and produce code from FORM.
FORM is of the form `(and FORM1 ...)'.
The translations of the sub-forms are concatenated and wrapped in
a shy group."
  (rx-check form)
  (let ((pieces (mapcar (lambda (sub-form)
                          (rx-to-string sub-form 'no-group))
                        (cdr form))))
    (concat "\\(?:" (apply 'concat pieces) "\\)")))
12c64503
GM
344
345
(defun rx-or (form)
  "Parse and produce code from FORM, which is `(or FORM1 ...)'.
When every argument is a string the alternatives are handed to
`regexp-opt'; otherwise each is translated and joined with `\\|'.
The result is always wrapped in a shy group."
  (rx-check form)
  (let* ((branches (cdr form))
         (literals-only (not (memq nil (mapcar 'stringp branches)))))
    (concat "\\(?:"
            (if literals-only
                (regexp-opt branches)
              (mapconcat #'rx-to-string branches "\\|"))
            "\\)")))
12c64503
GM
358
359
;; Set to "]" by `rx-check-any' when an argument contains `]';
;; dynamically bound in `rx-any', which emits it first in the set.
(defvar rx-bracket)
12c64503
GM
361
(defun rx-check-any (arg)
  "Check arg ARG for Rx `any'.
ARG may be a character, a non-empty string, a character-class
symbol, or a cons (FROM . TO) of two characters.  Return the text
to place inside the bracket expression; signal an error for
anything else.  As a side effect, set `rx-bracket' to \"]\" when a
string or character ARG contains `]'."
  (cond
   ((or (integerp arg) (stringp arg))
    (let ((text (if (integerp arg) (string arg) arg)))
      (if (zerop (length text))
          (error "String arg for Rx `any' must not be empty"))
      ;; Quote ^ at start; don't bother to check whether this is first arg.
      ;; NOTE(review): backslash is not an escape inside an Emacs bracket
      ;; expression, so this looks like it adds `\\' to the set -- confirm.
      (if (eq ?^ (aref text 0))
          (setq text (concat "\\" text)))
      ;; Remove ] and set flag for adding it to start of overall result.
      (when (string-match "\\]" text)
        (setq text (replace-regexp-in-string "\\]" "" text)
              rx-bracket "]"))
      text))
   ((symbolp arg)
    (let ((translation (condition-case nil
                           (rx-to-string arg 'no-group)
                         (error nil))))
      (unless translation
        (error "Invalid char class `%s' in Rx `any'" arg))
      ;; Strip the outer brackets: "[[:digit:]]" -> "[:digit:]".
      (substring translation 1 -1)))
   ;; sregex compatibility: a (FROM . TO) pair denotes a range.
   ((and (integerp (car-safe arg))
         (integerp (cdr-safe arg)))
    (string (car arg) ?- (cdr arg)))
   (t
    (error "rx `any' requires string, character, char pair or char class args"))))
12c64503
GM
389
(defun rx-any (form)
  "Parse and produce code from FORM, which is `(any ARG ...)'.
ARG is optional.
Each ARG is converted by `rx-check-any' and the results are
concatenated into a single bracket expression."
  (rx-check form)
  (let ((rx-bracket nil)
        members)
    ;; `rx-check-any' sets `rx-bracket' as a side effect.
    (setq members (mapcar #'rx-check-any (cdr form)))
    ;; If there was a ?- in the form, move it to the front to avoid
    ;; accidental range.
    (when (member "-" members)
      (setq members (cons "-" (delete "-" members))))
    (concat "[" rx-bracket (apply #'concat members) "]")))
12c64503
GM
401
402
740b7c2d
EZ
(defun rx-check-not (arg)
  "Check arg ARG for Rx `not'.
ARG is accepted if it is a symbol that translates to a character
class such as \"[[:digit:]]\", the symbol `word-boundary', or a
list whose car is one of `not', `any', `in', `syntax' or
`category'.  Return t if ARG is acceptable; otherwise signal an
error."
  (unless (or (and (symbolp arg)
                   ;; The class name is one or more characters long; without
                   ;; the `+' no class such as "[[:digit:]]" can ever match.
                   (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
                                 (condition-case nil
                                     (rx-to-string arg 'no-group)
                                   ;; Unknown symbols translate to "", which
                                   ;; cannot match the pattern above.
                                   (error ""))))
              (eq arg 'word-boundary)
              (and (consp arg)
                   (memq (car arg) '(not any in syntax category))))
    (error "rx `not' syntax error: %s" arg))
  t)
12c64503
GM
415
416
(defun rx-not (form)
  "Parse and produce code from FORM.  FORM is `(not ...)'.
The argument is translated and then complemented:
a negated set \"[^...]\" becomes \"[...]\" (or the quoted single
character it excludes), a set \"[...]\" becomes \"[^...]\", and
the backslash constructs \\s, \\c and \\b are swapped with \\S,
\\C and \\B in either direction.  Anything else is wrapped in
\"[^...]\"."
  (rx-check form)
  (let ((result (rx-to-string (cadr form) 'no-group))
        ;; Bound to nil so the [scb] / [SCB] tests below are case-sensitive.
        case-fold-search)
    (cond ((string-match "\\`\\[^" result)
           (if (= (length result) 4)
               ;; "[^x]" negated is just x, which must be quoted in case
               ;; it is special outside a bracket expression (e.g. `.').
               (regexp-quote (substring result 2 3))
             (concat "[" (substring result 2))))
          ((eq ?\[ (aref result 0))
           (concat "[^" (substring result 1)))
          ((string-match "\\`\\\\[scb]" result)
           (concat (upcase (substring result 0 2)) (substring result 2)))
          ;; Double negation, e.g. (not (not (syntax word))).
          ((string-match "\\`\\\\[SCB]" result)
           (concat (downcase (substring result 0 2)) (substring result 2)))
          (t
           (concat "[^" result "]")))))
432
433
ccfbe679
SM
(defun rx-not-char (form)
  "Parse and produce code from FORM.  FORM is `(not-char ...)'.
The arguments are treated as those of `(not (in ...))'."
  (rx-check form)
  (let ((char-set (cons 'in (cdr form))))
    (rx-not (list 'not char-set))))
438
439
(defun rx-not-syntax (form)
  "Parse and produce code from FORM.  FORM is `(not-syntax SYNTAX)'.
The argument is treated as that of `(not (syntax SYNTAX))'."
  (rx-check form)
  (let ((syntax-form (cons 'syntax (cdr form))))
    (rx-not (list 'not syntax-form))))
444
445
(defun rx-trans-forms (form &optional skip)
  "If FORM's length is greater than two, transform it to length two.
A form (HEAD REST ...) becomes (HEAD (and REST ...)).
If SKIP is non-nil, allow that number of items after the head, i.e.
`(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1.
FORM itself is returned when it already has exactly one REST item;
otherwise a modified copy is returned and FORM is left untouched."
  (let* ((offset (or skip 0))
         (rest (nthcdr (1+ offset) form)))
    (if (= (length rest) 1)
        form
      ;; Work on a copy so the caller's list is not modified.
      (let ((copy (copy-sequence form)))
        (setcdr (nthcdr offset copy) (list (cons 'and rest)))
        copy))))
458
459
(defun rx-= (form)
  "Parse and produce code from FORM `(= N ...)'.
N must be a positive integer.  The forms after N are treated as
if enclosed in `(and ...)' and the result matches exactly N
occurrences of them."
  (rx-check form)
  (setq form (rx-trans-forms form 1))
  (unless (and (integerp (nth 1 form))
               (> (nth 1 form) 0))
    (error "rx `=' requires positive integer first arg"))
  (let ((operand (rx-to-string (nth 2 form))))
    ;; `rx-to-string' does not group plain strings, so \"ab\" would
    ;; otherwise yield \"ab\\{N\\}\", repeating only the final `b'.
    (unless (rx-atomic-p operand)
      (setq operand (concat "\\(?:" operand "\\)")))
    (format "%s\\{%d\\}" operand (nth 1 form))))
468
469
(defun rx->= (form)
  "Parse and produce code from FORM `(>= N ...)'.
N must be a positive integer.  The forms after N are treated as
if enclosed in `(and ...)' and the result matches N or more
occurrences of them."
  (rx-check form)
  (setq form (rx-trans-forms form 1))
  (unless (and (integerp (nth 1 form))
               (> (nth 1 form) 0))
    (error "rx `>=' requires positive integer first arg"))
  (let ((operand (rx-to-string (nth 2 form))))
    ;; `rx-to-string' does not group plain strings, so \"ab\" would
    ;; otherwise yield \"ab\\{N,\\}\", repeating only the final `b'.
    (unless (rx-atomic-p operand)
      (setq operand (concat "\\(?:" operand "\\)")))
    (format "%s\\{%d,\\}" operand (nth 1 form))))
478
479
(defun rx-** (form)
  "Parse and produce code from FORM `(** N M ...)'.
The forms after M are collected into a single `(and ...)' and the
result is translated as the equivalent `(repeat N M FORM)'."
  (rx-check form)
  (let ((normalized (rx-trans-forms form 2)))
    (rx-to-string (cons 'repeat (cdr normalized)))))
485
486
12c64503
GM
(defun rx-repeat (form)
  "Parse and produce code from FORM.
FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'.
In the first case N must be a positive integer.  In the second, N
and M must be non-negative integers with N not greater than M."
  (rx-check form)
  (cond ((= (length form) 3)
         (unless (and (integerp (nth 1 form))
                      (> (nth 1 form) 0))
           (error "rx `repeat' requires positive integer first arg"))
         (let ((operand (rx-to-string (nth 2 form))))
           ;; `rx-to-string' does not group plain strings, so make sure
           ;; the interval applies to the whole operand.
           (unless (rx-atomic-p operand)
             (setq operand (concat "\\(?:" operand "\\)")))
           (format "%s\\{%d\\}" operand (nth 1 form))))
        ((or (not (integerp (nth 2 form)))
             (< (nth 2 form) 0)
             (not (integerp (nth 1 form)))
             (< (nth 1 form) 0)
             (< (nth 2 form) (nth 1 form)))
         (error "rx `repeat' range error"))
        (t
         (let ((operand (rx-to-string (nth 3 form))))
           (unless (rx-atomic-p operand)
             (setq operand (concat "\\(?:" operand "\\)")))
           (format "%s\\{%d,%d\\}" operand
                   (nth 1 form) (nth 2 form))))))
505
506
(defun rx-submatch (form)
  "Parse and produce code from FORM, which is `(submatch ...)'.
The translations of the sub-forms are concatenated inside a
capturing group."
  ;; NOTE(review): unlike the other handlers this one does not call
  ;; `rx-check', so the argument limits in `rx-constituents' are not
  ;; enforced here -- confirm whether that is intentional.
  (let ((pieces (mapcar (lambda (sub-form)
                          (rx-to-string sub-form 'no-group))
                        (cdr form))))
    (concat "\\(" (apply 'concat pieces) "\\)")))
12c64503 513
740b7c2d
EZ
(defun rx-backref (form)
  "Parse and produce code from FORM, which is `(backref N)'.
The result is a backslash followed by the decimal digits of N."
  (rx-check form)
  (let ((group-number (cadr form)))
    (format "\\%d" group-number)))
518
(defun rx-check-backref (arg)
  "Check arg ARG for Rx `backref'.
Return t if ARG is an integer from 1 to 9; otherwise signal an error."
  (unless (and (integerp arg)
               (>= arg 1)
               (<= arg 9))
    (error "rx `backref' requires numeric 1<=arg<=9: %s" arg))
  t)
523
12c64503
GM
(defun rx-kleene (form)
  "Parse and produce code from FORM.
FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
`zero-or-more' etc. operators.
If OP is one of `*', `+', `?', produce a greedy regexp.
If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
is non-nil.

The `?' and `??' operators are recognized both as the characters
the Lisp reader produces for `(? X)' and `(?? X)', and as the
symbols `\\?' and `\\??'."
  (rx-check form)
  (setq form (rx-trans-forms form))
  (let* ((operator (car form))
         ;; `(? X)' reads as (32 X) and `(?? X)' as (63 X); the escaped
         ;; symbols `\?' and `\??' name the same operators explicitly.
         (suffix (cond ((memq operator '(* + ?\s \?)) "")
                       ((memq operator '(*? +? ?\? \??)) "?")
                       (rx-greedy-flag "")
                       (t "?")))
         (op (cond ((memq operator '(* *? 0+ zero-or-more)) "*")
                   ((memq operator '(+ +? 1+ one-or-more)) "+")
                   (t "?")))
         (result (rx-to-string (cadr form) 'no-group)))
    ;; The postfix operator must apply to the whole operand.
    (if (not (rx-atomic-p result))
        (setq result (concat "\\(?:" result "\\)")))
    (concat result op suffix)))
545
(defun rx-atomic-skip-bracket (r start)
  "Return the index just past the bracket expression opening at START in R.
R is a regexp string and the character at START must be `['.
Return nil if the bracket expression is not terminated within R."
  (let ((len (length r))
        (i (1+ start))
        end)
    ;; A leading `^' negates the set, and a `]' in first position
    ;; (after the optional `^') is a literal member, not the terminator.
    (if (and (< i len) (eq (aref r i) ?^))
        (setq i (1+ i)))
    (if (and (< i len) (eq (aref r i) ?\]))
        (setq i (1+ i)))
    (while (and (< i len) (not end))
      (cond
       ((eq (aref r i) ?\])
        (setq end (1+ i)))
       ((and (eq (aref r i) ?\[)
             (< (1+ i) len)
             (eq (aref r (1+ i)) ?:))
        ;; Possibly a `[:class:]' item, whose `]' does not end the set.
        (let ((j (+ i 2)))
          (while (and (< j len)
                      (>= (aref r j) ?a)
                      (<= (aref r j) ?z))
            (setq j (1+ j)))
          (setq i (if (and (> j (+ i 2))
                           (< (1+ j) len)
                           (eq (aref r j) ?:)
                           (eq (aref r (1+ j)) ?\]))
                      (+ j 2)
                    ;; Not a class after all: `[' is an ordinary member.
                    (1+ i)))))
       (t
        (setq i (1+ i)))))
    end))

(defun rx-atomic-p (r)
  "Return non-nil if regexp string R is atomic.
An atomic regexp R is one such that a suffix operator
appended to R will apply to all of R.  For example, \"a\"
\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
\"[ab]c\", and \"ab\\|ab*c\" are not atomic.

This function may return false negatives, but it is meant not
to return false positives.  It recognizes the following
categories of atomic regexp:

  a group or shy group:  \\(...\\)
  a character class:     [...]
  a single character:    a

For the first two categories R is scanned to make sure that the
opening delimiter is closed only by the very end of R, so that
strings such as \"\\(a\\)b\\(c\\)\" and \"[a]b[c]\" are rejected.

False negatives are returned for regexps that are atomic but do
not fit these categories, such as \"\\.\" or \"\\sw\"."
  (let ((len (length r)))
    (cond
     ((= len 1) t)
     ;; Group: walk the string tracking group depth; the group opened at
     ;; position 0 must not close before the end of R.
     ((and (>= len 4)
           (eq (aref r 0) ?\\)
           (eq (aref r 1) ?\())
      (let ((i 2)
            (depth 1))
        (while (and (< i len) (> depth 0))
          (let ((c (aref r i)))
            (cond
             ((eq c ?\\)
              (if (< (1+ i) len)
                  (let ((next (aref r (1+ i))))
                    (cond ((eq next ?\() (setq depth (1+ depth)))
                          ((eq next ?\)) (setq depth (1- depth))))))
              ;; Skip the backslash together with the character it quotes.
              (setq i (+ i 2)))
             ((eq c ?\[)
              ;; Backslashes are literal inside a bracket expression, so
              ;; skip it as a unit; give up if it is unterminated.
              (setq i (or (rx-atomic-skip-bracket r i) len)))
             (t
              (setq i (1+ i))))))
        (and (= depth 0) (= i len))))
     ;; Character class: the bracket opened at 0 must end exactly at the
     ;; end of R.
     ((and (>= len 2) (eq (aref r 0) ?\[))
      (eq (rx-atomic-skip-bracket r 0) len)))))
12c64503
GM
578
579
(defun rx-syntax (form)
  "Parse and produce code from FORM, which is `(syntax SYMBOL)'.
SYMBOL is looked up in the alist `rx-syntax'.  If it is not found
there and its name is a single character, that character is
looked up as a syntax character instead.  Signal an error if
neither lookup succeeds."
  (rx-check form)
  (let* ((sym (cadr form))
         (entry (or (assq sym rx-syntax)
                    ;; Try sregex compatibility.
                    (let ((name (symbol-name sym)))
                      (and (= 1 (length name))
                           (rassq (aref name 0) rx-syntax))))))
    (unless entry
      (error "Unknown rx syntax `%s'" sym))
    (format "\\s%c" (cdr entry))))
593
594
(defun rx-check-category (form)
  "Check the argument FORM of a `(category FORM)'.
Return t if FORM is an integer or a symbol listed in
`rx-categories'; otherwise signal an error."
  (if (or (integerp form)
          (cdr (assq form rx-categories)))
      t
    (error "Unknown category `%s'" form)))
a1506d29 601
12c64503
GM
602
(defun rx-category (form)
  "Parse and produce code from FORM, which is `(category SYMBOL)'.
SYMBOL may also be an integer, which is used directly as the
category character."
  (rx-check form)
  (let* ((designator (cadr form))
         (category-char (if (integerp designator)
                            designator
                          (cdr (assq designator rx-categories)))))
    (format "\\c%c" category-char)))
610
611
(defun rx-eval (form)
  "Parse and produce code from FORM, which is `(eval FORM)'.
The argument is evaluated with `eval' and its value is then
translated as an rx form."
  (rx-check form)
  (let ((value (eval (cadr form))))
    (rx-to-string value)))
616
617
(defun rx-greedy (form)
  "Parse and produce code from FORM.
If FORM is '(minimal-match FORM1)', non-greedy versions of `*',
`+', and `?' operators will be used in FORM1.  If FORM is
'(maximal-match FORM1)', greedy operators will be used.
This works by binding `rx-greedy-flag' around the translation of
FORM1."
  (rx-check form)
  (let ((rx-greedy-flag (if (eq (car form) 'maximal-match) t nil)))
    (rx-to-string (cadr form))))
626
627
(defun rx-regexp (form)
  "Parse and produce code from FORM, which is `(regexp STRING)'.
STRING is inserted unchanged, wrapped in a shy group."
  (rx-check form)
  (let ((raw-regexp (cadr form)))
    (concat "\\(?:" raw-regexp "\\)")))
632
633
634;;;###autoload
(defun rx-to-string (form &optional no-group)
  "Parse and produce code for regular expression FORM.
FORM is a regular expression in sexp form.
NO-GROUP non-nil means don't put shy groups around the result.

Strings and characters are quoted literally.  Symbols and lists
are translated according to `rx-constituents'; only the result of
a list form is ever wrapped in a shy group."
  (cond
   ;; (OP ARG ...): dispatch to the handler function registered for OP.
   ((consp form)
    (let ((info (rx-info (car form))))
      (if (not (consp info))
          (error "Unknown rx form `%s'" (car form)))
      (let ((regexp (funcall (car info) form)))
        ;; Results that already begin with a group are left as they are.
        (if (and (not no-group)
                 (not (string-match "\\`\\\\[(]" regexp)))
            (concat "\\(?:" regexp "\\)")
          regexp))))
   ;; Bare symbol: either a fixed string or a handler function.
   ((symbolp form)
    (let ((info (rx-info form)))
      (cond ((null info)
             (error "Unknown rx form `%s'" form))
            ((stringp info)
             info)
            (t
             (funcall (car info) form)))))
   ((stringp form)
    (regexp-quote form))
   ((integerp form)
    (regexp-quote (char-to-string form)))
   (t
    (error "rx syntax error at `%s'" form))))
12c64503
GM
661
662
663;;;###autoload
ccfbe679
SM
(defmacro rx (&rest regexps)
  "Translate regular expressions REGEXPS in sexp form to a regexp string.
REGEXPS is a non-empty sequence of forms of the sort listed below.
See also `rx-to-string' for how to do such a translation at run-time.

The following are valid subforms of regular expressions in sexp
notation.

STRING
     matches string STRING literally.

CHAR
     matches character CHAR literally.

`not-newline', `nonl'
     matches any character except a newline.

`anything'
     matches any character, including a newline.

`(any SET ...)'
`(in SET ...)'
`(char SET ...)'
     matches any character in SET ....  SET may be a character or string.
     Ranges of characters can be specified as `A-Z' in strings.
     Ranges may also be specified as conses like `(?A . ?Z)'.

     SET may also be the name of a character class: `digit',
     `control', `hex-digit', `blank', `graph', `print', `alnum',
     `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
     `word', or one of their synonyms.

`(not (any SET ...))'
     matches any character not in SET ...

`line-start', `bol'
     matches the empty string, but only at the beginning of a line
     in the text being matched

`line-end', `eol'
     is similar to `line-start' but matches only at the end of a line

`string-start', `bos', `bot'
     matches the empty string, but only at the beginning of the
     string being matched against.

`string-end', `eos', `eot'
     matches the empty string, but only at the end of the
     string being matched against.

`buffer-start'
     matches the empty string, but only at the beginning of the
     buffer being matched against.  Actually equivalent to `string-start'.

`buffer-end'
     matches the empty string, but only at the end of the
     buffer being matched against.  Actually equivalent to `string-end'.

`point'
     matches the empty string, but only at point.

`word-start', `bow'
     matches the empty string, but only at the beginning of a word.

`word-end', `eow'
     matches the empty string, but only at the end of a word.

`word-boundary'
     matches the empty string, but only at the beginning or end of a
     word.

`(not word-boundary)'
`not-word-boundary'
     matches the empty string, but not at the beginning or end of a
     word.

`symbol-start'
     matches the empty string, but only at the beginning of a symbol.

`symbol-end'
     matches the empty string, but only at the end of a symbol.

`digit', `numeric', `num'
     matches 0 through 9.

`control', `cntrl'
     matches ASCII control characters.

`hex-digit', `hex', `xdigit'
     matches 0 through 9, a through f and A through F.

`blank'
     matches space and tab only.

`graphic', `graph'
     matches graphic characters--everything except ASCII control chars,
     space, and DEL.

`printing', `print'
     matches printing characters--everything except ASCII control chars
     and DEL.

`alphanumeric', `alnum'
     matches letters and digits.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`letter', `alphabetic', `alpha'
     matches letters.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`ascii'
     matches ASCII (unibyte) characters.

`nonascii'
     matches non-ASCII (multibyte) characters.

`lower', `lower-case'
     matches anything lower-case.

`upper', `upper-case'
     matches anything upper-case.

`punctuation', `punct'
     matches punctuation.  (But at present, for multibyte characters,
     it matches anything that has non-word syntax.)

`space', `whitespace', `white'
     matches anything that has whitespace syntax.

`word', `wordchar'
     matches anything that has word syntax.

`not-wordchar'
     matches anything that has non-word syntax.

`(syntax SYNTAX)'
     matches a character with syntax SYNTAX.  SYNTAX must be one
     of the following symbols, or a symbol corresponding to the syntax
     character, e.g. `\\.' for `\\s.'.

     `whitespace'               (\\s- in string notation)
     `punctuation'              (\\s.)
     `word'                     (\\sw)
     `symbol'                   (\\s_)
     `open-parenthesis'         (\\s()
     `close-parenthesis'        (\\s))
     `expression-prefix'        (\\s')
     `string-quote'             (\\s\")
     `paired-delimiter'         (\\s$)
     `escape'                   (\\s\\)
     `character-quote'          (\\s/)
     `comment-start'            (\\s<)
     `comment-end'              (\\s>)
     `string-delimiter'         (\\s|)
     `comment-delimiter'        (\\s!)

`(not (syntax SYNTAX))'
     matches a character that doesn't have syntax SYNTAX.

`(category CATEGORY)'
     matches a character with category CATEGORY.  CATEGORY must be
     either a character to use for C, or one of the following symbols.

     `consonant'                        (\\c0 in string notation)
     `base-vowel'                       (\\c1)
     `upper-diacritical-mark'           (\\c2)
     `lower-diacritical-mark'           (\\c3)
     `tone-mark'                        (\\c4)
     `symbol'                           (\\c5)
     `digit'                            (\\c6)
     `vowel-modifying-diacritical-mark' (\\c7)
     `vowel-sign'                       (\\c8)
     `semivowel-lower'                  (\\c9)
     `not-at-end-of-line'               (\\c<)
     `not-at-beginning-of-line'         (\\c>)
     `alpha-numeric-two-byte'           (\\cA)
     `chinse-two-byte'                  (\\cC)
     `greek-two-byte'                   (\\cG)
     `japanese-hiragana-two-byte'       (\\cH)
     `indian-two-byte'                  (\\cI)
     `japanese-katakana-two-byte'       (\\cK)
     `korean-hangul-two-byte'           (\\cN)
     `cyrillic-two-byte'                (\\cY)
     `combining-diacritic'              (\\c^)
     `ascii'                            (\\ca)
     `arabic'                           (\\cb)
     `chinese'                          (\\cc)
     `ethiopic'                         (\\ce)
     `greek'                            (\\cg)
     `korean'                           (\\ch)
     `indian'                           (\\ci)
     `japanese'                         (\\cj)
     `japanese-katakana'                (\\ck)
     `latin'                            (\\cl)
     `lao'                              (\\co)
     `tibetan'                          (\\cq)
     `japanese-roman'                   (\\cr)
     `thai'                             (\\ct)
     `vietnamese'                       (\\cv)
     `hebrew'                           (\\cw)
     `cyrillic'                         (\\cy)
     `can-break'                        (\\c|)

`(not (category CATEGORY))'
     matches a character that doesn't have category CATEGORY.

`(and SEXP1 SEXP2 ...)'
`(: SEXP1 SEXP2 ...)'
`(seq SEXP1 SEXP2 ...)'
`(sequence SEXP1 SEXP2 ...)'
     matches what SEXP1 matches, followed by what SEXP2 matches, etc.

`(submatch SEXP1 SEXP2 ...)'
`(group SEXP1 SEXP2 ...)'
     like `and', but makes the match accessible with `match-end',
     `match-beginning', and `match-string'.

`(or SEXP1 SEXP2 ...)'
`(| SEXP1 SEXP2 ...)'
     matches anything that matches SEXP1 or SEXP2, etc.  If all
     args are strings, use `regexp-opt' to optimize the resulting
     regular expression.

`(minimal-match SEXP)'
     produce a non-greedy regexp for SEXP.  Normally, regexps matching
     zero or more occurrences of something are \"greedy\" in that they
     match as much as they can, as long as the overall regexp can
     still match.  A non-greedy regexp matches as little as possible.

`(maximal-match SEXP)'
     produce a greedy regexp for SEXP.  This is the default.

Below, `SEXP ...' represents a sequence of regexp forms, treated as if
enclosed in `(and ...)'.

`(zero-or-more SEXP ...)'
`(0+ SEXP ...)'
     matches zero or more occurrences of what SEXP ... matches.

`(* SEXP ...)'
     like `zero-or-more', but always produces a greedy regexp, independent
     of `rx-greedy-flag'.

`(*? SEXP ...)'
     like `zero-or-more', but always produces a non-greedy regexp,
     independent of `rx-greedy-flag'.

`(one-or-more SEXP ...)'
`(1+ SEXP ...)'
     matches one or more occurrences of SEXP ...

`(+ SEXP ...)'
     like `one-or-more', but always produces a greedy regexp.

`(+? SEXP ...)'
     like `one-or-more', but always produces a non-greedy regexp.

`(zero-or-one SEXP ...)'
`(optional SEXP ...)'
`(opt SEXP ...)'
     matches zero or one occurrences of what SEXP ... matches.

`(? SEXP ...)'
     like `zero-or-one', but always produces a greedy regexp.

`(?? SEXP ...)'
     like `zero-or-one', but always produces a non-greedy regexp.

`(repeat N SEXP)'
`(= N SEXP ...)'
     matches N occurrences.

`(>= N SEXP ...)'
     matches N or more occurrences.

`(repeat N M SEXP)'
`(** N M SEXP ...)'
     matches N to M occurrences.

`(backref N)'
     matches what was matched previously by submatch N.

`(eval FORM)'
     evaluate FORM and insert result.  If result is a string,
     `regexp-quote' it.

`(regexp REGEXP)'
     include REGEXP in string notation in the result."
  ;; The translation happens at macro-expansion time, so the expansion
  ;; is simply the resulting regexp string.
  (cond ((null regexps)
         (error "No regexp"))
        ((cdr regexps)
         ;; Several forms are treated as an implicit sequence.
         (rx-to-string `(and ,@regexps) t))
        (t
         (rx-to-string (car regexps) t))))
967\f
968;; ;; sregex.el replacement
969
970;; ;;;###autoload (provide 'sregex)
971;; ;;;###autoload (autoload 'sregex "rx")
972;; (defalias 'sregex 'rx-to-string)
973;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
974;; (defalias 'sregexq 'rx)
975\f
12c64503
GM
976(provide 'rx)
977
b62c13c2 978;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b
12c64503 979;;; rx.el ends here