(reduce, fill, replace, remove*, remove-if, remove-if-not, delete*, delete-if,
[bpt/emacs.git] / lisp / emacs-lisp / rx.el
CommitLineData
12c64503
GM
1;;; rx.el --- sexp notation for regular expressions
2
b62c13c2 3;; Copyright (C) 2001, 2003, 2004, 2005 Free Software Foundation, Inc.
12c64503
GM
4
5;; Author: Gerd Moellmann <gerd@gnu.org>
6;; Maintainer: FSF
7;; Keywords: strings, regexps, extensions
8
9;; This file is part of GNU Emacs.
10
11;; GNU Emacs is free software; you can redistribute it and/or modify
12;; it under the terms of the GNU General Public License as published by
13;; the Free Software Foundation; either version 2, or (at your option)
14;; any later version.
15
16;; GNU Emacs is distributed in the hope that it will be useful,
17;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19;; GNU General Public License for more details.
20
21;; You should have received a copy of the GNU General Public License
22;; along with GNU Emacs; see the file COPYING. If not, write to the
23;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24;; Boston, MA 02111-1307, USA.
25
26;;; Commentary:
27
28;; This is another implementation of sexp-form regular expressions.
29;; It was unfortunately written without being aware of the Sregex
30;; package coming with Emacs, but as things stand, Rx completely
31;; covers all regexp features, which Sregex doesn't, doesn't suffer
32;; from the bugs mentioned in the commentary section of Sregex, and
33;; uses a nicer syntax (IMHO, of course :-).
34
ccfbe679
SM
35;; This significantly extended version of the original, is almost
36;; compatible with Sregex. The only incompatibility I (fx) know of is
37;; that the `repeat' form can't have multiple regexp args.
38
39;; Now alternative forms are provided for a degree of compatibility
40;; with Shivers' attempted definitive SRE notation
41;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>. SRE forms not
42;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
43;; ,<exp>, (word ...), word+, posix-string, and character class forms.
44;; Some forms are inconsistent with SRE, either for historical reasons
45;; or because of the implementation -- simple translation into Emacs
46;; regexp strings. These include: any, word. Also, case-sensitivity
47;; and greediness are controlled by variables external to the regexp,
48;; and you need to feed the forms to the `posix-' functions to get
49;; SRE's POSIX semantics. There are probably more difficulties.
50
12c64503
GM
51;; Rx translates a sexp notation for regular expressions into the
52;; usual string notation. The translation can be done at compile-time
53;; by using the `rx' macro. It can be done at run-time by calling
54;; function `rx-to-string'. See the documentation of `rx' for a
55;; complete description of the sexp notation.
56;;
57;; Some examples of string regexps and their sexp counterparts:
58;;
59;; "^[a-z]*"
60;; (rx (and line-start (0+ (in "a-z"))))
61;;
62;; "\n[^ \t]"
63;; (rx (and "\n" (not blank))), or
64;; (rx (and "\n" (not (any " \t"))))
65;;
66;; "\\*\\*\\* EOOH \\*\\*\\*\n"
67;; (rx "*** EOOH ***\n")
68;;
69;; "\\<\\(catch\\|finally\\)\\>[^_]"
70;; (rx (and word-start (submatch (or "catch" "finally")) word-end
71;; (not (any ?_))))
72;;
73;; "[ \t\n]*:\\([^:]+\\|$\\)"
74;; (rx (and (zero-or-more (in " \t\n")) ":"
75;; (submatch (or line-end (one-or-more (not (any ?:)))))))
76;;
77;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
;; (rx (and line-start
;;          "content-transfer-encoding:"
;;          (* (? ?\n) blank)
;;          "quoted-printable"
;;          (* (? ?\n) blank)))
12c64503
GM
83;;
84;; (concat "^\\(?:" something-else "\\)")
85;; (rx (and line-start (eval something-else))), statically or
;; (rx-to-string `(and line-start ,something-else)), dynamically.
87;;
88;; (regexp-opt '(STRING1 STRING2 ...))
89;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
90;; calls `regexp-opt' as needed.
91;;
92;; "^;;\\s-*\n\\|^\n"
93;; (rx (or (and line-start ";;" (0+ space) ?\n)
94;; (and line-start ?\n)))
95;;
96;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
a1506d29
JB
97;; (rx (and "$Id: "
98;; (1+ (not (in " ")))
12c64503
GM
99;; " "
100;; (submatch (1+ (not (in " "))))
c53f9b3b 101;; " "))
12c64503
GM
102;;
103;; "\\\\\\\\\\[\\w+"
104;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
105;;
106;; etc.
107
108;;; History:
a1506d29 109;;
12c64503
GM
110
111;;; Code:
112
12c64503
GM
(defconst rx-constituents
  '((and                . (rx-and 1 nil))
    (seq                . and)            ; SRE
    (:                  . and)            ; SRE
    (sequence           . and)            ; sregex
    (or                 . (rx-or 1 nil))
    (|                  . or)             ; SRE
    (not-newline        . ".")
    (nonl               . not-newline)    ; SRE
    ;; Wrapped in a shy group: string definitions are spliced verbatim
    ;; into sequences, and a bare alternation would otherwise capture
    ;; the neighbouring forms, e.g. (and "a" anything "b").
    (anything           . "\\(?:.\\|\n\\)")
    (any                . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
    (in                 . any)
    (char               . any)            ; sregex
    (not-char           . (rx-not-char 1 nil rx-check-any)) ; sregex
    (not                . (rx-not 1 1 rx-check-not))
    ;; Partially consistent with sregex, whose `repeat' is like our
    ;; `**'.  (`repeat' with optional max arg and multiple sexp forms
    ;; is ambiguous.)
    (repeat             . (rx-repeat 2 3))
    (=                  . (rx-= 2 nil))   ; SRE
    (>=                 . (rx->= 2 nil))  ; SRE
    (**                 . (rx-** 2 nil))  ; SRE
    (submatch           . (rx-submatch 1 nil)) ; SRE
    (group              . submatch)
    (zero-or-more       . (rx-kleene 1 nil))
    (one-or-more        . (rx-kleene 1 nil))
    (zero-or-one        . (rx-kleene 1 nil))
    (\?                 . zero-or-one)    ; SRE
    (\??                . zero-or-one)
    (*                  . zero-or-more)   ; SRE
    (*?                 . zero-or-more)
    (0+                 . zero-or-more)
    (+                  . one-or-more)    ; SRE
    (+?                 . one-or-more)
    (1+                 . one-or-more)
    (optional           . zero-or-one)
    (opt                . zero-or-one)    ; sregex
    (minimal-match      . (rx-greedy 1 1))
    (maximal-match      . (rx-greedy 1 1))
    (backref            . (rx-backref 1 1 rx-check-backref))
    (line-start         . "^")
    (bol                . line-start)     ; SRE
    (line-end           . "$")
    (eol                . line-end)       ; SRE
    (string-start       . "\\`")
    (bos                . string-start)   ; SRE
    (bot                . string-start)   ; sregex
    (string-end         . "\\'")
    (eos                . string-end)     ; SRE
    (eot                . string-end)     ; sregex
    (buffer-start       . "\\`")
    (buffer-end         . "\\'")
    (point              . "\\=")
    (word-start         . "\\<")
    (bow                . word-start)     ; SRE
    (word-end           . "\\>")
    (eow                . word-end)       ; SRE
    (word-boundary      . "\\b")
    (not-word-boundary  . "\\B")          ; sregex
    (symbol-start       . "\\_<")
    (symbol-end         . "\\_>")
    (syntax             . (rx-syntax 1 1))
    (not-syntax         . (rx-not-syntax 1 1)) ; sregex
    (category           . (rx-category 1 1 rx-check-category))
    (eval               . (rx-eval 1 1))
    (regexp             . (rx-regexp 1 1 stringp))
    (digit              . "[[:digit:]]")
    (numeric            . digit)          ; SRE
    (num                . digit)          ; SRE
    (control            . "[[:cntrl:]]")  ; SRE
    (cntrl              . control)        ; SRE
    (hex-digit          . "[[:xdigit:]]") ; SRE
    (hex                . hex-digit)      ; SRE
    (xdigit             . hex-digit)      ; SRE
    (blank              . "[[:blank:]]")  ; SRE
    (graphic            . "[[:graph:]]")  ; SRE
    (graph              . graphic)        ; SRE
    (printing           . "[[:print:]]")  ; SRE
    (print              . printing)       ; SRE
    (alphanumeric       . "[[:alnum:]]")  ; SRE
    (alnum              . alphanumeric)   ; SRE
    (letter             . "[[:alpha:]]")
    (alphabetic         . letter)         ; SRE
    (alpha              . letter)         ; SRE
    (ascii              . "[[:ascii:]]")  ; SRE
    (nonascii           . "[[:nonascii:]]")
    (lower              . "[[:lower:]]")  ; SRE
    (lower-case         . lower)          ; SRE
    (punctuation        . "[[:punct:]]")  ; SRE
    (punct              . punctuation)    ; SRE
    (space              . "[[:space:]]")  ; SRE
    (whitespace         . space)          ; SRE
    (white              . space)          ; SRE
    (upper              . "[[:upper:]]")  ; SRE
    (upper-case         . upper)          ; SRE
    (word               . "[[:word:]]")   ; inconsistent with SRE
    (wordchar           . word)           ; sregex
    (not-wordchar       . "[^[:word:]]")  ; sregex (use \\W?)
    )
  "Alist of sexp form regexp constituents.
Each element of the alist has the form (SYMBOL . DEFN).
SYMBOL is a valid constituent of sexp regular expressions.
If DEFN is a string, SYMBOL is translated into DEFN.  The string is
inserted verbatim into the surrounding regexp, so it must be safe to
concatenate with other regexp fragments.
If DEFN is a symbol, use the definition of DEFN, recursively.
Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
are the minimum and maximum number of arguments the function-form
sexp constituent SYMBOL may have in sexp regular expressions.
MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
all arguments must satisfy PREDICATE.")
223
224
(defconst rx-syntax
  '(;; Character kinds.
    (whitespace         . ?-)
    (punctuation        . ?.)
    (word               . ?w)
    (symbol             . ?_)
    ;; Bracketing and quoting.
    (open-parenthesis   . ?\()
    (close-parenthesis  . ?\))
    (expression-prefix  . ?\')
    (string-quote       . ?\")
    (paired-delimiter   . ?$)
    (escape             . ?\\)
    (character-quote    . ?/)
    ;; Comments and generic delimiters.
    (comment-start      . ?<)
    (comment-end        . ?>)
    (string-delimiter   . ?|)
    (comment-delimiter  . ?!))
  "Alist translating Rx syntax symbols into syntax characters.
Every element looks like (SYMBOL . CHAR).  SYMBOL is what may be
written in `(syntax SYMBOL)', and CHAR is the matching syntax
character, i.e. the one that follows \\s or \\S in a regular
expression string.")
246
247
(defconst rx-categories
  '((consonant                        . ?0)
    (base-vowel                       . ?1)
    (upper-diacritical-mark           . ?2)
    (lower-diacritical-mark           . ?3)
    (tone-mark                        . ?4)
    (symbol                           . ?5)
    (digit                            . ?6)
    (vowel-modifying-diacritical-mark . ?7)
    (vowel-sign                       . ?8)
    (semivowel-lower                  . ?9)
    (not-at-end-of-line               . ?<)
    (not-at-beginning-of-line         . ?>)
    (alpha-numeric-two-byte           . ?A)
    (chinese-two-byte                 . ?C)
    ;; Historical misspelling, kept so existing callers keep working.
    (chinse-two-byte                  . ?C)
    (greek-two-byte                   . ?G)
    (japanese-hiragana-two-byte       . ?H)
    (indian-two-byte                  . ?I)
    (japanese-katakana-two-byte       . ?K)
    (korean-hangul-two-byte           . ?N)
    (cyrillic-two-byte                . ?Y)
    (combining-diacritic              . ?^)
    (ascii                            . ?a)
    (arabic                           . ?b)
    (chinese                          . ?c)
    (ethiopic                         . ?e)
    (greek                            . ?g)
    (korean                           . ?h)
    (indian                           . ?i)
    (japanese                         . ?j)
    (japanese-katakana                . ?k)
    (latin                            . ?l)
    (lao                              . ?o)
    (tibetan                          . ?q)
    (japanese-roman                   . ?r)
    (thai                             . ?t)
    (vietnamese                       . ?v)
    (hebrew                           . ?w)
    (cyrillic                         . ?y)
    (can-break                        . ?|))
  "Alist mapping symbols to category characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(category SYMBOL)', and CHAR is the category character
corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
regular expression strings.

`chinse-two-byte' is a misspelled synonym of `chinese-two-byte',
retained for backward compatibility.")
293
294
(defvar rx-greedy-flag t
  "Whether `zero-or-one', `zero-or-more' and `one-or-more' are greedy.
A non-nil value makes those forms produce greedy regular
expressions; nil makes them produce non-greedy ones.
Dynamically bound.")
298
299
(defun rx-info (op)
  "Look up the parsing/code generation info for OP.
The Lisp reader turns the head of `(? X)' into the space
character (ASCII 32) and the head of `(?? X)' into the character
`?', so those two characters are first mapped back to the symbols
`?' and `??' respectively.  Symbol definitions found in
`rx-constituents' are then followed until a non-symbol definition
\(a string or a list) or nil is reached, and that is returned."
  (let ((entry (cond ((eq op ? ) '\?)
                     ((eq op ??) '\??)
                     (t op))))
    ;; Chase aliases such as `seq' -> `and' -> (rx-and 1 nil).
    (while (and entry (symbolp entry))
      (setq entry (cdr (assq entry rx-constituents))))
    entry))
a1506d29 310
12c64503
GM
311
(defun rx-check (form)
  "Validate FORM against the parsing info of its head.
FORM must be a list.  The info returned by `rx-info' for its car
supplies the minimum and maximum number of arguments and an
optional predicate that every argument must satisfy.  Signal an
error if any of these constraints is violated."
  (if (not (listp form))
      (error "rx `%s' needs argument(s)" form))
  (let* ((head (car form))
         (info (rx-info head))
         (count (1- (length form)))
         (least (nth 1 info))
         (most (nth 2 info))
         (pred (nth 3 info)))
    ;; A nil limit means that side is unconstrained.
    (if (and least (< count least))
        (error "rx form `%s' requires at least %d args"
               head least))
    (if (and most (> count most))
        (error "rx form `%s' accepts at most %d args"
               head most))
    (if pred
        (dolist (operand (cdr form))
          (or (funcall pred operand)
              (error "rx form `%s' requires args satisfying `%s'"
                     head pred))))))
334
335
(defun rx-and (form)
  "Translate FORM, which looks like `(and FORM1 ...)'.
Each sub-form is translated without its own shy group, the pieces
are concatenated, and the whole is wrapped in one shy group."
  (rx-check form)
  (let ((pieces (mapcar (lambda (sub) (rx-to-string sub 'no-group))
                        (cdr form))))
    (concat "\\(?:" (apply #'concat pieces) "\\)")))
12c64503
GM
345
346
(defun rx-or (form)
  "Translate FORM, which looks like `(or FORM1 ...)'.
When every alternative is a string, the alternatives are handed to
`regexp-opt'; otherwise each one is translated with `rx-to-string'
and the results are joined with \\|.  The result is always wrapped
in a shy group."
  (rx-check form)
  (let* ((alternatives (cdr form))
         (strings-only (not (memq nil (mapcar #'stringp alternatives))))
         (body (if strings-only
                   (regexp-opt alternatives)
                 (mapconcat #'rx-to-string alternatives "\\|"))))
    (concat "\\(?:" body "\\)")))
12c64503
GM
359
360
;; Set by `rx-check-any'; dynamically bound in `rx-any'.
(defvar rx-bracket)

(defun rx-check-any (arg)
  "Validate and normalize ARG, one argument of an Rx `any' form.
ARG may be a character, a non-empty string, a cons (FROM . TO) of
two characters, or a symbol naming a character class.  Return the
text to splice between the brackets of the resulting character
alternative; signal an error for any other kind of argument.

A `]' found in a character or string argument is removed from the
returned text and `rx-bracket' is set to \"]\" instead, so that the
caller can put it first inside the brackets."
  (cond
   ((or (integerp arg) (stringp arg))
    (let ((chars (if (integerp arg) (string arg) arg)))
      (if (= 0 (length chars))
          (error "String arg for Rx `any' must not be empty"))
      ;; Quote ^ at start; don't bother to check whether this is the
      ;; first arg.
      ;; NOTE(review): this prepends a backslash; confirm whether that
      ;; backslash ends up as a literal member of the resulting set.
      (if (eq (aref chars 0) ?^)
          (setq chars (concat "\\" chars)))
      (if (string-match "]" chars)
          (setq rx-bracket "]"
                chars (replace-regexp-in-string "]" "" chars)))
      chars))
   ((symbolp arg)
    (let ((class (condition-case nil
                     (rx-to-string arg 'no-group)
                   (error nil))))
      (or class
          (error "Invalid char class `%s' in Rx `any'" arg))
      ;; Strip the outer brackets of e.g. "[[:digit:]]".
      (substring class 1 -1)))
   ;; sregex compatibility: (FROM . TO) denotes a range.
   ((and (integerp (car-safe arg))
         (integerp (cdr-safe arg)))
    (string (car arg) ?- (cdr arg)))
   (t
    (error "rx `any' requires string, character, char pair or char class args"))))
12c64503
GM
390
(defun rx-any (form)
  "Translate FORM, which looks like `(any ARG ...)'.
Every ARG is normalized by `rx-check-any' and the results are
concatenated inside a bracket expression.  A `]' reported through
`rx-bracket' is placed first, and a lone \"-\" argument is moved
ahead of the other arguments so that it cannot form a range."
  (rx-check form)
  (let* ((rx-bracket nil)
         ;; `rx-check-any' may set `rx-bracket' as a side effect.
         (pieces (mapcar #'rx-check-any (cdr form))))
    (when (member "-" pieces)
      (setq pieces (cons "-" (delete "-" pieces))))
    (concat "[" rx-bracket (apply #'concat pieces) "]")))
12c64503
GM
402
403
740b7c2d
EZ
(defun rx-check-not (arg)
  "Check arg ARG for Rx `not'.
ARG is acceptable if it is a symbol that translates to a single
character class such as \"[[:digit:]]\", the symbol
`word-boundary', or a list whose car is one of `not', `any',
`in', `syntax' or `category'.  Return t if ARG is acceptable,
otherwise signal an error."
  (unless (or (and (symbolp arg)
                   ;; The class name is one or more characters long;
                   ;; without the `+' no real class name could match.
                   (string-match "\\`\\[\\[:[-a-z]+:]]\\'"
                                 (condition-case nil
                                     (rx-to-string arg 'no-group)
                                   (error ""))))
              (eq arg 'word-boundary)
              (and (consp arg)
                   (memq (car arg) '(not any in syntax category))))
    (error "rx `not' syntax error: %s" arg))
  t)
12c64503
GM
416
417
(defun rx-not (form)
  "Parse and produce code from FORM.  FORM is `(not ...)'.
The argument is translated and the result is negated:
a bracket expression gains or loses its leading `^', and a
backslash construct \\s, \\c or \\b is switched to \\S, \\C or \\B
and vice versa.  Anything else is wrapped in a negated bracket
expression."
  (rx-check form)
  (let ((result (rx-to-string (cadr form) 'no-group))
        ;; Bound to nil so that the matches below are case-sensitive.
        case-fold-search)
    (cond ((string-match "\\`\\[^" result)
           ;; Double negation.
           (if (= (length result) 4)
               ;; \"[^X]\" negates to the single character X, which
               ;; must be quoted in case it is special outside brackets.
               (regexp-quote (substring result 2 3))
             (concat "[" (substring result 2))))
          ((eq ?\[ (aref result 0))
           (concat "[^" (substring result 1)))
          ((string-match "\\`\\\\[scb]" result)
           (concat (upcase (substring result 0 2)) (substring result 2)))
          ;; Double negation of a syntax, category or word-boundary
          ;; construct: switch back to the positive form.
          ((string-match "\\`\\\\[SCB]" result)
           (concat (downcase (substring result 0 2)) (substring result 2)))
          (t
           (concat "[^" result "]")))))
433
434
ccfbe679
SM
(defun rx-not-char (form)
  "Translate FORM, which looks like `(not-char ...)'.
The arguments are re-packaged as `(not (in ...))' and handed to
`rx-not'."
  (rx-check form)
  (let ((set (cons 'in (cdr form))))
    (rx-not (list 'not set))))
439
440
(defun rx-not-syntax (form)
  "Translate FORM, which looks like `(not-syntax SYNTAX)'.
The argument is re-packaged as `(not (syntax SYNTAX))' and handed
to `rx-not'."
  (rx-check form)
  (let ((positive (cons 'syntax (cdr form))))
    (rx-not (list 'not positive))))
445
446
(defun rx-trans-forms (form &optional skip)
  "Fold the trailing sub-forms of FORM into a single `and' form.
With SKIP nil or 0, (HEAD REST ...) becomes (HEAD (and REST ...)).
A non-nil SKIP is the number of leading arguments to leave alone,
so with SKIP 1 `(= N REST ...)' becomes `(= N (and REST ...))'.
FORM itself is returned when exactly one sub-form follows the
skipped ones; otherwise a modified copy is returned and FORM is
left untouched."
  (let* ((offset (or skip 0))
         (rest (nthcdr (1+ offset) form)))
    (cond ((= 1 (length rest))
           form)
          (t
           (let ((copy (copy-sequence form)))
             ;; Replace everything after the kept items by one `and'.
             (setcdr (nthcdr offset copy) (list (cons 'and rest)))
             copy)))))
459
460
(defun rx-= (form)
  "Translate FORM, which looks like `(= N ...)'.
The forms after N are treated as one sequence that must be
repeated exactly N times.  N must be a positive integer."
  (rx-check form)
  (let* ((normalized (rx-trans-forms form 1))
         (times (nth 1 normalized)))
    (if (not (and (integerp times) (> times 0)))
        (error "rx `=' requires positive integer first arg"))
    (format "%s\\{%d\\}" (rx-to-string (nth 2 normalized)) times)))
469
470
(defun rx->= (form)
  "Translate FORM, which looks like `(>= N ...)'.
The forms after N are treated as one sequence that must be
repeated N or more times.  N must be a positive integer."
  (rx-check form)
  (let* ((normalized (rx-trans-forms form 1))
         (times (nth 1 normalized)))
    (if (not (and (integerp times) (> times 0)))
        (error "rx `>=' requires positive integer first arg"))
    (format "%s\\{%d,\\}" (rx-to-string (nth 2 normalized)) times)))
479
480
(defun rx-** (form)
  "Translate FORM, which looks like `(** N M ...)'.
The forms after N and M are folded into one `and' form, the head
is replaced by `repeat', and the result is translated with
`rx-to-string'."
  (rx-check form)
  (let ((normalized (rx-trans-forms form 2)))
    (rx-to-string (cons 'repeat (cdr normalized)))))
486
487
12c64503
GM
(defun rx-repeat (form)
  "Translate FORM, a `repeat' form.
FORM looks like `(repeat N FORM1)', meaning exactly N repetitions
with N a positive integer, or like `(repeat N M FORM1)', meaning
between N and M repetitions with 0 <= N <= M."
  (rx-check form)
  (let ((low (nth 1 form)))
    (if (= (length form) 3)
        (progn
          (or (and (integerp low) (> low 0))
              (error "rx `repeat' requires positive integer first arg"))
          (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) low))
      (let ((high (nth 2 form)))
        ;; The type tests come first so the comparisons never see
        ;; a non-number.
        (if (and (integerp high) (>= high 0)
                 (integerp low) (>= low 0)
                 (>= high low))
            (format "%s\\{%d,%d\\}" (rx-to-string (nth 3 form))
                    low high)
          (error "rx `repeat' range error"))))))
506
507
(defun rx-submatch (form)
  "Translate FORM, which looks like `(submatch ...)'.
The sub-forms are translated without shy groups of their own,
concatenated, and enclosed in a capturing group."
  (let ((pieces (mapcar (lambda (sub) (rx-to-string sub 'no-group))
                        (cdr form))))
    (concat "\\(" (apply #'concat pieces) "\\)")))
12c64503 514
740b7c2d
EZ
(defun rx-backref (form)
  "Translate FORM, which looks like `(backref N)'.
The result is a backslash followed by N."
  (rx-check form)
  (let ((group-number (cadr form)))
    (format "\\%d" group-number)))
519
(defun rx-check-backref (arg)
  "Validate ARG, the argument of an Rx `backref' form.
Return t if ARG is an integer between 1 and 9 inclusive;
otherwise signal an error."
  (if (and (integerp arg) (<= 1 arg) (<= arg 9))
      t
    (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
524
12c64503
GM
(defun rx-kleene (form)
  "Translate FORM, a repetition form `(OP FORM1 ...)'.
OP is one of the `zero-or-one', `zero-or-more', `one-or-more'
operators or one of their synonyms.
The operators `*', `+' and `?' always give a greedy regexp.
The operators `*?', `+?' and `??' always give a non-greedy regexp.
Every other operator gives a greedy regexp exactly when
`rx-greedy-flag' is non-nil."
  (rx-check form)
  (setq form (rx-trans-forms form))
  (let* ((head (car form))
         (operator (cond ((memq head '(* *? 0+ zero-or-more)) "*")
                         ((memq head '(+ +? 1+ one-or-more)) "+")
                         (t "?")))
         ;; The reader delivers the heads of `(? X)' and `(?? X)' as
         ;; the characters space and `?', hence the character
         ;; literals in the two lists below.
         (laziness (cond ((memq head '(* + ? )) "")
                         ((memq head '(*? +? ??)) "?")
                         (rx-greedy-flag "")
                         (t "?")))
         (body (rx-to-string (cadr form) 'no-group)))
    ;; The postfix operator must apply to the whole of BODY.
    (concat (if (rx-atomic-p body)
                body
              (concat "\\(?:" body "\\)"))
            operator
            laziness)))
546
(defun rx-atomic-p--bracket-end (r pos)
  "Return the index just past the bracket expression starting at POS in R.
Return nil if no complete bracket expression starts at POS."
  (and (eq pos (string-match
                ;; [ + optional ^ + optional leading ] + members + ]
                "\\[\\^?]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*]" r pos))
       (match-end 0)))

(defun rx-atomic-p--single-group-p (r)
  "Return t if the group opening at the start of R closes only at its end.
R is assumed to start with \\( and to end with \\)."
  (let ((len (length r))
        (i 0)
        (depth 0)
        (ok t))
    (while (and ok (< i len))
      (let ((c (aref r i)))
        (cond
         ;; A backslash construct: track group nesting, skip both chars.
         ((and (eq c ?\\) (< (1+ i) len))
          (let ((next (aref r (1+ i))))
            (cond ((eq next ?\()
                   (setq depth (1+ depth)))
                  ((eq next ?\))
                   (setq depth (1- depth))
                   ;; The outermost group closed before the end of R.
                   (if (and (<= depth 0) (< (+ i 2) len))
                       (setq ok nil)))))
          (setq i (+ i 2)))
         ;; A bracket expression: skip it, its contents are literal.
         ((eq c ?\[)
          (let ((end (rx-atomic-p--bracket-end r i)))
            (if end
                (setq i end)
              ;; Cannot parse it; err on the side of \"not atomic\".
              (setq ok nil))))
         (t
          (setq i (1+ i))))))
    (and ok (= depth 0))))

(defun rx-atomic-p (r)
  "Return non-nil if regexp string R is atomic.
An atomic regexp R is one such that a suffix operator
appended to R will apply to all of R.  For example, \"a\"
\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
\"[ab]c\", and \"ab\\|ab*c\" are not atomic.

This function may return false negatives, but it is meant not
to return false positives.  It recognizes the following
categories of atomic regexp:

  a group or shy group:  \\(...\\)
  a character class:     [...]
  a single character:    a

For the first two categories the whole of R must be one such
construct; strings like \"[ab]c[de]\" or \"\\(a\\)\\|\\(b\\)\", which
merely start and end like one, are not atomic.

False negatives are returned for regexps that are atomic but end
in operators, such as \"a+\", and for groups shorter than six
characters.  The match data is preserved."
  (save-match-data
    (let ((len (length r)))
      (cond
       ((= len 1) t)
       ;; The length threshold is kept from the original implementation.
       ((and (>= len 6)
             (equal (substring r 0 2) "\\(")
             (equal (substring r -2) "\\)"))
        (rx-atomic-p--single-group-p r))
       ((and (>= len 2)
             (eq (aref r 0) ?\[)
             (eq (aref r (1- len)) ?\]))
        ;; The bracket expression opened first must end at the end of R.
        (eq len (rx-atomic-p--bracket-end r 0)))
       (t nil)))))
12c64503
GM
579
580
(defun rx-syntax (form)
  "Parse and produce code from FORM, which is `(syntax SYMBOL)'.
SYMBOL is normally one of the symbols listed in the alist
`rx-syntax'.  For sregex compatibility it may also be a syntax
character found in that alist, or a symbol whose one-character
name is such a character.  Signal an error for anything else."
  (rx-check form)
  (let* ((sym (cadr form))
         (syntax (assq sym rx-syntax)))
    (unless syntax
      ;; Try sregex compatibility.
      (cond ((integerp sym)
             (setq syntax (rassq sym rx-syntax)))
            ;; Guarded so that a non-symbol reaches the rx error
            ;; below instead of failing inside `symbol-name'.
            ((symbolp sym)
             (let ((name (symbol-name sym)))
               (if (= 1 (length name))
                   (setq syntax (rassq (aref name 0) rx-syntax))))))
      (unless syntax
        (error "Unknown rx syntax `%s'" sym)))
    (format "\\s%c" (cdr syntax))))
594
595
(defun rx-check-category (form)
  "Validate FORM, the argument of a `(category FORM)' form.
Return t if FORM is an integer or a symbol with an entry in
`rx-categories'; otherwise signal an error."
  (if (or (integerp form)
          (cdr (assq form rx-categories)))
      t
    (error "Unknown category `%s'" form)))
a1506d29 602
12c64503
GM
603
(defun rx-category (form)
  "Translate FORM, which looks like `(category SYMBOL)'.
An integer argument is used as the category character directly;
a symbol is looked up in `rx-categories'."
  (rx-check form)
  (let* ((designator (cadr form))
         (char (if (integerp designator)
                   designator
                 (cdr (assq designator rx-categories)))))
    (format "\\c%c" char)))
611
612
(defun rx-eval (form)
  "Translate FORM, which looks like `(eval EXPR)'.
EXPR is evaluated with `eval' and its value is then translated
with `rx-to-string'."
  (rx-check form)
  ;; NOTE(review): this evaluates arbitrary Lisp taken from the rx
  ;; form; only use it on forms from a trusted source.
  (let ((value (eval (cadr form))))
    (rx-to-string value)))
617
618
(defun rx-greedy (form)
  "Translate FORM, a `minimal-match' or `maximal-match' form.
For `(minimal-match FORM1)', FORM1 is translated with
`rx-greedy-flag' bound to nil, so that the non-greedy versions of
the `*', `+' and `?' operators are used.  For
`(maximal-match FORM1)' the flag is bound to t and the greedy
operators are used."
  (rx-check form)
  (let ((rx-greedy-flag (if (eq (car form) 'maximal-match) t nil)))
    (rx-to-string (cadr form))))
627
628
(defun rx-regexp (form)
  "Translate FORM, which looks like `(regexp STRING)'.
STRING is inserted unchanged, enclosed in a shy group."
  (rx-check form)
  (let ((literal (cadr form)))
    (concat "\\(?:" literal "\\)")))
633
634
635;;;###autoload
(defun rx-to-string (form &optional no-group)
  "Translate the sexp regular expression FORM into a regexp string.
A string or a character is quoted with `regexp-quote'.  A symbol
or a list is translated according to the info `rx-info' returns
for the symbol or for the car of the list.
The translation of a list is wrapped in a shy group unless
NO-GROUP is non-nil or the translation already starts with a
group."
  (cond
   ((stringp form)
    (regexp-quote form))
   ((integerp form)
    (regexp-quote (char-to-string form)))
   ((symbolp form)
    (let ((info (rx-info form)))
      (cond ((null info)
             (error "Unknown rx form `%s'" form))
            ((stringp info)
             info)
            (t
             (funcall (car info) form)))))
   ((consp form)
    (let ((info (rx-info (car form))))
      (if (not (consp info))
          (error "Unknown rx form `%s'" (car form)))
      (let ((result (funcall (car info) form)))
        (if (and (not no-group)
                 (not (string-match "\\`\\\\[(]" result)))
            (concat "\\(?:" result "\\)")
          result))))
   (t
    (error "rx syntax error at `%s'" form))))
12c64503
GM
662
663
664;;;###autoload
ccfbe679
SM
(defmacro rx (&rest regexps)
  "Translate regular expressions REGEXPS in sexp form to a regexp string.
REGEXPS is a non-empty sequence of forms of the sort listed below.
See also `rx-to-string' for how to do such a translation at run-time.

The following are valid subforms of regular expressions in sexp
notation.

STRING
  matches string STRING literally.

CHAR
  matches character CHAR literally.

`not-newline', `nonl'
  matches any character except a newline.

`anything'
  matches any character.

`(any SET ...)'
`(in SET ...)'
`(char SET ...)'
  matches any character in SET ....  SET may be a character or string.
  Ranges of characters can be specified as `A-Z' in strings.
  Ranges may also be specified as conses like `(?A . ?Z)'.

  SET may also be the name of a character class: `digit',
  `control', `hex-digit', `blank', `graph', `print', `alnum',
  `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
  `word', or one of their synonyms.

`(not (any SET ...))'
  matches any character not in SET ...

`line-start', `bol'
  matches the empty string, but only at the beginning of a line
  in the text being matched

`line-end', `eol'
  is similar to `line-start' but matches only at the end of a line

`string-start', `bos', `bot'
  matches the empty string, but only at the beginning of the
  string being matched against.

`string-end', `eos', `eot'
  matches the empty string, but only at the end of the
  string being matched against.

`buffer-start'
  matches the empty string, but only at the beginning of the
  buffer being matched against.  Actually equivalent to `string-start'.

`buffer-end'
  matches the empty string, but only at the end of the
  buffer being matched against.  Actually equivalent to `string-end'.

`point'
  matches the empty string, but only at point.

`word-start', `bow'
  matches the empty string, but only at the beginning of a word.

`word-end', `eow'
  matches the empty string, but only at the end of a word.

`word-boundary'
  matches the empty string, but only at the beginning or end of a
  word.

`(not word-boundary)'
`not-word-boundary'
  matches the empty string, but not at the beginning or end of a
  word.

`digit', `numeric', `num'
  matches 0 through 9.

`control', `cntrl'
  matches ASCII control characters.

`hex-digit', `hex', `xdigit'
  matches 0 through 9, a through f and A through F.

`blank'
  matches space and tab only.

`graphic', `graph'
  matches graphic characters--everything except ASCII control chars,
  space, and DEL.

`printing', `print'
  matches printing characters--everything except ASCII control chars
  and DEL.

`alphanumeric', `alnum'
  matches letters and digits.  (But at present, for multibyte characters,
  it matches anything that has word syntax.)

`letter', `alphabetic', `alpha'
  matches letters.  (But at present, for multibyte characters,
  it matches anything that has word syntax.)

`ascii'
  matches ASCII (unibyte) characters.

`nonascii'
  matches non-ASCII (multibyte) characters.

`lower', `lower-case'
  matches anything lower-case.

`upper', `upper-case'
  matches anything upper-case.

`punctuation', `punct'
  matches punctuation.  (But at present, for multibyte characters,
  it matches anything that has non-word syntax.)

`space', `whitespace', `white'
  matches anything that has whitespace syntax.

`word', `wordchar'
  matches anything that has word syntax.

`not-wordchar'
  matches anything that has non-word syntax.

`(syntax SYNTAX)'
  matches a character with syntax SYNTAX.  SYNTAX must be one
  of the following symbols, or a symbol corresponding to the syntax
  character, e.g. `\\.' for `\\s.'.

     `whitespace'         (\\s- in string notation)
     `punctuation'        (\\s.)
     `word'               (\\sw)
     `symbol'             (\\s_)
     `open-parenthesis'   (\\s()
     `close-parenthesis'  (\\s))
     `expression-prefix'  (\\s')
     `string-quote'       (\\s\")
     `paired-delimiter'   (\\s$)
     `escape'             (\\s\\)
     `character-quote'    (\\s/)
     `comment-start'      (\\s<)
     `comment-end'        (\\s>)
     `string-delimiter'   (\\s|)
     `comment-delimiter'  (\\s!)

`(not (syntax SYNTAX))'
  matches a character that doesn't have syntax SYNTAX.

`(category CATEGORY)'
  matches a character with category CATEGORY.  CATEGORY must be
  either a character to use for C, or one of the following symbols.

     `consonant'                         (\\c0 in string notation)
     `base-vowel'                        (\\c1)
     `upper-diacritical-mark'            (\\c2)
     `lower-diacritical-mark'            (\\c3)
     `tone-mark'                         (\\c4)
     `symbol'                            (\\c5)
     `digit'                             (\\c6)
     `vowel-modifying-diacritical-mark'  (\\c7)
     `vowel-sign'                        (\\c8)
     `semivowel-lower'                   (\\c9)
     `not-at-end-of-line'                (\\c<)
     `not-at-beginning-of-line'          (\\c>)
     `alpha-numeric-two-byte'            (\\cA)
     `chinse-two-byte'                   (\\cC)
     `greek-two-byte'                    (\\cG)
     `japanese-hiragana-two-byte'        (\\cH)
     `indian-two-byte'                   (\\cI)
     `japanese-katakana-two-byte'        (\\cK)
     `korean-hangul-two-byte'            (\\cN)
     `cyrillic-two-byte'                 (\\cY)
     `combining-diacritic'               (\\c^)
     `ascii'                             (\\ca)
     `arabic'                            (\\cb)
     `chinese'                           (\\cc)
     `ethiopic'                          (\\ce)
     `greek'                             (\\cg)
     `korean'                            (\\ch)
     `indian'                            (\\ci)
     `japanese'                          (\\cj)
     `japanese-katakana'                 (\\ck)
     `latin'                             (\\cl)
     `lao'                               (\\co)
     `tibetan'                           (\\cq)
     `japanese-roman'                    (\\cr)
     `thai'                              (\\ct)
     `vietnamese'                        (\\cv)
     `hebrew'                            (\\cw)
     `cyrillic'                          (\\cy)
     `can-break'                         (\\c|)

`(not (category CATEGORY))'
  matches a character that doesn't have category CATEGORY.

`(and SEXP1 SEXP2 ...)'
`(: SEXP1 SEXP2 ...)'
`(seq SEXP1 SEXP2 ...)'
`(sequence SEXP1 SEXP2 ...)'
  matches what SEXP1 matches, followed by what SEXP2 matches, etc.

`(submatch SEXP1 SEXP2 ...)'
`(group SEXP1 SEXP2 ...)'
  like `and', but makes the match accessible with `match-end',
  `match-beginning', and `match-string'.

`(or SEXP1 SEXP2 ...)'
`(| SEXP1 SEXP2 ...)'
  matches anything that matches SEXP1 or SEXP2, etc.  If all
  args are strings, use `regexp-opt' to optimize the resulting
  regular expression.

`(minimal-match SEXP)'
  produce a non-greedy regexp for SEXP.  Normally, regexps matching
  zero or more occurrences of something are \"greedy\" in that they
  match as much as they can, as long as the overall regexp can
  still match.  A non-greedy regexp matches as little as possible.

`(maximal-match SEXP)'
  produce a greedy regexp for SEXP.  This is the default.

Below, `SEXP ...' represents a sequence of regexp forms, treated as if
enclosed in `(and ...)'.

`(zero-or-more SEXP ...)'
`(0+ SEXP ...)'
  matches zero or more occurrences of what SEXP ... matches.

`(* SEXP ...)'
  like `zero-or-more', but always produces a greedy regexp, independent
  of `rx-greedy-flag'.

`(*? SEXP ...)'
  like `zero-or-more', but always produces a non-greedy regexp,
  independent of `rx-greedy-flag'.

`(one-or-more SEXP ...)'
`(1+ SEXP ...)'
  matches one or more occurrences of SEXP ...

`(+ SEXP ...)'
  like `one-or-more', but always produces a greedy regexp.

`(+? SEXP ...)'
  like `one-or-more', but always produces a non-greedy regexp.

`(zero-or-one SEXP ...)'
`(optional SEXP ...)'
`(opt SEXP ...)'
  matches zero or one occurrences of SEXP ...

`(? SEXP ...)'
  like `zero-or-one', but always produces a greedy regexp.

`(?? SEXP ...)'
  like `zero-or-one', but always produces a non-greedy regexp.

`(repeat N SEXP)'
`(= N SEXP ...)'
  matches N occurrences.

`(>= N SEXP ...)'
  matches N or more occurrences.

`(repeat N M SEXP)'
`(** N M SEXP ...)'
  matches N to M occurrences.

`(backref N)'
  matches what was matched previously by submatch N.

`(eval FORM)'
  evaluate FORM and insert result.  If result is a string,
  `regexp-quote' it.

`(regexp REGEXP)'
  include REGEXP in string notation in the result."
  (cond ((null regexps)
         (error "No regexp"))
        ((cdr regexps)
         ;; Several forms are treated as one implicit sequence.
         (rx-to-string `(and ,@regexps) t))
        (t
         (rx-to-string (car regexps) t))))
963\f
964;; ;; sregex.el replacement
965
966;; ;;;###autoload (provide 'sregex)
967;; ;;;###autoload (autoload 'sregex "rx")
968;; (defalias 'sregex 'rx-to-string)
969;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
970;; (defalias 'sregexq 'rx)
971\f
12c64503
GM
972(provide 'rx)
973
b62c13c2 974;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b
12c64503 975;;; rx.el ends here