Add 2010 to copyright years.
[bpt/emacs.git] / lisp / emacs-lisp / rx.el
CommitLineData
12c64503
GM
1;;; rx.el --- sexp notation for regular expressions
2
ceb4c4d3 3;; Copyright (C) 2001, 2002, 2003, 2004, 2005,
114f9c96 4;; 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
12c64503
GM
5
6;; Author: Gerd Moellmann <gerd@gnu.org>
7;; Maintainer: FSF
8;; Keywords: strings, regexps, extensions
9
10;; This file is part of GNU Emacs.
11
d6cba7ae 12;; GNU Emacs is free software: you can redistribute it and/or modify
12c64503 13;; it under the terms of the GNU General Public License as published by
d6cba7ae
GM
14;; the Free Software Foundation, either version 3 of the License, or
15;; (at your option) any later version.
12c64503
GM
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
d6cba7ae 23;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
12c64503
GM
24
25;;; Commentary:
26
27;; This is another implementation of sexp-form regular expressions.
28;; It was unfortunately written without being aware of the Sregex
29;; package coming with Emacs, but as things stand, Rx completely
30;; covers all regexp features, which Sregex doesn't, doesn't suffer
31;; from the bugs mentioned in the commentary section of Sregex, and
32;; uses a nicer syntax (IMHO, of course :-).
33
ccfbe679
SM
34;; This significantly extended version of the original, is almost
35;; compatible with Sregex. The only incompatibility I (fx) know of is
36;; that the `repeat' form can't have multiple regexp args.
37
38;; Now alternative forms are provided for a degree of compatibility
39;; with Shivers' attempted definitive SRE notation
40;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>. SRE forms not
41;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
42;; ,<exp>, (word ...), word+, posix-string, and character class forms.
43;; Some forms are inconsistent with SRE, either for historical reasons
44;; or because of the implementation -- simple translation into Emacs
45;; regexp strings. These include: any, word. Also, case-sensitivity
46;; and greediness are controlled by variables external to the regexp,
47;; and you need to feed the forms to the `posix-' functions to get
48;; SRE's POSIX semantics. There are probably more difficulties.
49
12c64503
GM
50;; Rx translates a sexp notation for regular expressions into the
51;; usual string notation. The translation can be done at compile-time
52;; by using the `rx' macro. It can be done at run-time by calling
53;; function `rx-to-string'. See the documentation of `rx' for a
54;; complete description of the sexp notation.
55;;
56;; Some examples of string regexps and their sexp counterparts:
57;;
58;; "^[a-z]*"
59;; (rx (and line-start (0+ (in "a-z"))))
60;;
61;; "\n[^ \t]"
62;; (rx (and "\n" (not blank))), or
63;; (rx (and "\n" (not (any " \t"))))
64;;
65;; "\\*\\*\\* EOOH \\*\\*\\*\n"
66;; (rx "*** EOOH ***\n")
67;;
68;; "\\<\\(catch\\|finally\\)\\>[^_]"
69;; (rx (and word-start (submatch (or "catch" "finally")) word-end
70;; (not (any ?_))))
71;;
72;; "[ \t\n]*:\\([^:]+\\|$\\)"
73;; (rx (and (zero-or-more (in " \t\n")) ":"
74;; (submatch (or line-end (one-or-more (not (any ?:)))))))
75;;
76;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
77;; (rx (and line-start
78;; "content-transfer-encoding:"
c53f9b3b 79;; (+ (? ?\n)) blank
12c64503 80;; "quoted-printable"
c53f9b3b 81;; (+ (? ?\n)) blank))
12c64503
GM
82;;
83;; (concat "^\\(?:" something-else "\\)")
84;; (rx (and line-start (eval something-else))), statically or
85;; (rx-to-string '(and line-start ,something-else)), dynamically.
86;;
87;; (regexp-opt '(STRING1 STRING2 ...))
88;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
89;; calls `regexp-opt' as needed.
90;;
91;; "^;;\\s-*\n\\|^\n"
92;; (rx (or (and line-start ";;" (0+ space) ?\n)
93;; (and line-start ?\n)))
94;;
95;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
a1506d29
JB
96;; (rx (and "$Id: "
97;; (1+ (not (in " ")))
12c64503
GM
98;; " "
99;; (submatch (1+ (not (in " "))))
c53f9b3b 100;; " "))
12c64503
GM
101;;
102;; "\\\\\\\\\\[\\w+"
103;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
104;;
105;; etc.
106
107;;; History:
a1506d29 108;;
12c64503
GM
109
110;;; Code:
111
12c64503
GM
112(defconst rx-constituents
113 '((and . (rx-and 1 nil))
ccfbe679
SM
114 (seq . and) ; SRE
115 (: . and) ; SRE
116 (sequence . and) ; sregex
12c64503 117 (or . (rx-or 1 nil))
ccfbe679 118 (| . or) ; SRE
12c64503 119 (not-newline . ".")
ccfbe679 120 (nonl . not-newline) ; SRE
5dbe5c8f 121 (anything . (rx-anything 0 nil))
ccfbe679 122 (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
12c64503 123 (in . any)
ccfbe679
SM
124 (char . any) ; sregex
125 (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex
12c64503 126 (not . (rx-not 1 1 rx-check-not))
ccfbe679
SM
127 ;; Partially consistent with sregex, whose `repeat' is like our
128 ;; `**'. (`repeat' with optional max arg and multiple sexp forms
129 ;; is ambiguous.)
12c64503 130 (repeat . (rx-repeat 2 3))
ccfbe679
SM
131 (= . (rx-= 2 nil)) ; SRE
132 (>= . (rx->= 2 nil)) ; SRE
133 (** . (rx-** 2 nil)) ; SRE
134 (submatch . (rx-submatch 1 nil)) ; SRE
12c64503 135 (group . submatch)
ccfbe679
SM
136 (zero-or-more . (rx-kleene 1 nil))
137 (one-or-more . (rx-kleene 1 nil))
138 (zero-or-one . (rx-kleene 1 nil))
139 (\? . zero-or-one) ; SRE
12c64503 140 (\?? . zero-or-one)
ccfbe679 141 (* . zero-or-more) ; SRE
12c64503
GM
142 (*? . zero-or-more)
143 (0+ . zero-or-more)
ccfbe679 144 (+ . one-or-more) ; SRE
12c64503
GM
145 (+? . one-or-more)
146 (1+ . one-or-more)
147 (optional . zero-or-one)
ccfbe679 148 (opt . zero-or-one) ; sregex
12c64503
GM
149 (minimal-match . (rx-greedy 1 1))
150 (maximal-match . (rx-greedy 1 1))
740b7c2d 151 (backref . (rx-backref 1 1 rx-check-backref))
12c64503 152 (line-start . "^")
ccfbe679 153 (bol . line-start) ; SRE
12c64503 154 (line-end . "$")
ccfbe679 155 (eol . line-end) ; SRE
12c64503 156 (string-start . "\\`")
ccfbe679
SM
157 (bos . string-start) ; SRE
158 (bot . string-start) ; sregex
12c64503 159 (string-end . "\\'")
ccfbe679
SM
160 (eos . string-end) ; SRE
161 (eot . string-end) ; sregex
12c64503
GM
162 (buffer-start . "\\`")
163 (buffer-end . "\\'")
164 (point . "\\=")
165 (word-start . "\\<")
ccfbe679 166 (bow . word-start) ; SRE
12c64503 167 (word-end . "\\>")
ccfbe679 168 (eow . word-end) ; SRE
12c64503 169 (word-boundary . "\\b")
ccfbe679 170 (not-word-boundary . "\\B") ; sregex
b62c13c2
SM
171 (symbol-start . "\\_<")
172 (symbol-end . "\\_>")
12c64503 173 (syntax . (rx-syntax 1 1))
ccfbe679 174 (not-syntax . (rx-not-syntax 1 1)) ; sregex
12c64503
GM
175 (category . (rx-category 1 1 rx-check-category))
176 (eval . (rx-eval 1 1))
177 (regexp . (rx-regexp 1 1 stringp))
178 (digit . "[[:digit:]]")
ccfbe679
SM
179 (numeric . digit) ; SRE
180 (num . digit) ; SRE
181 (control . "[[:cntrl:]]") ; SRE
182 (cntrl . control) ; SRE
183 (hex-digit . "[[:xdigit:]]") ; SRE
184 (hex . hex-digit) ; SRE
185 (xdigit . hex-digit) ; SRE
186 (blank . "[[:blank:]]") ; SRE
187 (graphic . "[[:graph:]]") ; SRE
188 (graph . graphic) ; SRE
189 (printing . "[[:print:]]") ; SRE
190 (print . printing) ; SRE
191 (alphanumeric . "[[:alnum:]]") ; SRE
192 (alnum . alphanumeric) ; SRE
12c64503 193 (letter . "[[:alpha:]]")
ccfbe679
SM
194 (alphabetic . letter) ; SRE
195 (alpha . letter) ; SRE
196 (ascii . "[[:ascii:]]") ; SRE
12c64503 197 (nonascii . "[[:nonascii:]]")
ccfbe679
SM
198 (lower . "[[:lower:]]") ; SRE
199 (lower-case . lower) ; SRE
200 (punctuation . "[[:punct:]]") ; SRE
201 (punct . punctuation) ; SRE
202 (space . "[[:space:]]") ; SRE
203 (whitespace . space) ; SRE
204 (white . space) ; SRE
205 (upper . "[[:upper:]]") ; SRE
206 (upper-case . upper) ; SRE
207 (word . "[[:word:]]") ; inconsistent with SRE
208 (wordchar . word) ; sregex
5dbe5c8f 209 (not-wordchar . "\\W"))
12c64503
GM
210 "Alist of sexp form regexp constituents.
211Each element of the alist has the form (SYMBOL . DEFN).
212SYMBOL is a valid constituent of sexp regular expressions.
213If DEFN is a string, SYMBOL is translated into DEFN.
214If DEFN is a symbol, use the definition of DEFN, recursively.
215Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
216FUNCTION is used to produce code for SYMBOL. MIN-ARGS and MAX-ARGS
217are the minimum and maximum number of arguments the function-form
218sexp constituent SYMBOL may have in sexp regular expressions.
219MAX-ARGS nil means no limit. PREDICATE, if specified, means that
220all arguments must satisfy PREDICATE.")
221
222
223(defconst rx-syntax
224 '((whitespace . ?-)
225 (punctuation . ?.)
226 (word . ?w)
227 (symbol . ?_)
228 (open-parenthesis . ?\()
229 (close-parenthesis . ?\))
230 (expression-prefix . ?\')
231 (string-quote . ?\")
232 (paired-delimiter . ?$)
233 (escape . ?\\)
234 (character-quote . ?/)
235 (comment-start . ?<)
740b7c2d
EZ
236 (comment-end . ?>)
237 (string-delimiter . ?|)
09c774f7 238 (comment-delimiter . ?!))
12c64503
GM
239 "Alist mapping Rx syntax symbols to syntax characters.
240Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
241symbol in `(syntax SYMBOL)', and CHAR is the syntax character
242corresponding to SYMBOL, as it would be used with \\s or \\S in
243regular expressions.")
244
245
246(defconst rx-categories
247 '((consonant . ?0)
248 (base-vowel . ?1)
249 (upper-diacritical-mark . ?2)
250 (lower-diacritical-mark . ?3)
251 (tone-mark . ?4)
252 (symbol . ?5)
253 (digit . ?6)
254 (vowel-modifying-diacritical-mark . ?7)
255 (vowel-sign . ?8)
256 (semivowel-lower . ?9)
257 (not-at-end-of-line . ?<)
258 (not-at-beginning-of-line . ?>)
259 (alpha-numeric-two-byte . ?A)
260 (chinse-two-byte . ?C)
261 (greek-two-byte . ?G)
262 (japanese-hiragana-two-byte . ?H)
263 (indian-two-byte . ?I)
264 (japanese-katakana-two-byte . ?K)
265 (korean-hangul-two-byte . ?N)
266 (cyrillic-two-byte . ?Y)
740b7c2d 267 (combining-diacritic . ?^)
12c64503
GM
268 (ascii . ?a)
269 (arabic . ?b)
270 (chinese . ?c)
271 (ethiopic . ?e)
272 (greek . ?g)
273 (korean . ?h)
274 (indian . ?i)
275 (japanese . ?j)
276 (japanese-katakana . ?k)
277 (latin . ?l)
278 (lao . ?o)
279 (tibetan . ?q)
280 (japanese-roman . ?r)
281 (thai . ?t)
282 (vietnamese . ?v)
283 (hebrew . ?w)
284 (cyrillic . ?y)
285 (can-break . ?|))
286 "Alist mapping symbols to category characters.
287Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
288symbol in `(category SYMBOL)', and CHAR is the category character
289corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
290regular expression strings.")
291
292
293(defvar rx-greedy-flag t
294 "Non-nil means produce greedy regular expressions for `zero-or-one',
295`zero-or-more', and `one-or-more'. Dynamically bound.")
296
297
298(defun rx-info (op)
299 "Return parsing/code generation info for OP.
300If OP is the space character ASCII 32, return info for the symbol `?'.
301If OP is the character `?', return info for the symbol `??'.
302See also `rx-constituents'."
303 (cond ((eq op ? ) (setq op '\?))
304 ((eq op ??) (setq op '\??)))
305 (while (and (not (null op)) (symbolp op))
306 (setq op (cdr (assq op rx-constituents))))
307 op)
a1506d29 308
12c64503
GM
309
310(defun rx-check (form)
311 "Check FORM according to its car's parsing info."
ccfbe679
SM
312 (unless (listp form)
313 (error "rx `%s' needs argument(s)" form))
12c64503
GM
314 (let* ((rx (rx-info (car form)))
315 (nargs (1- (length form)))
316 (min-args (nth 1 rx))
317 (max-args (nth 2 rx))
318 (type-pred (nth 3 rx)))
319 (when (and (not (null min-args))
320 (< nargs min-args))
740b7c2d 321 (error "rx form `%s' requires at least %d args"
12c64503
GM
322 (car form) min-args))
323 (when (and (not (null max-args))
324 (> nargs max-args))
740b7c2d 325 (error "rx form `%s' accepts at most %d args"
12c64503
GM
326 (car form) max-args))
327 (when (not (null type-pred))
328 (dolist (sub-form (cdr form))
329 (unless (funcall type-pred sub-form)
740b7c2d 330 (error "rx form `%s' requires args satisfying `%s'"
12c64503
GM
331 (car form) type-pred))))))
332
333
5dbe5c8f
CY
334(defun rx-group-if (regexp group)
335 "Put shy groups around REGEXP if seemingly necessary when GROUP
336is non-nil."
337 (cond
338 ;; for some repetition
339 ((eq group '*) (if (rx-atomic-p regexp) (setq group nil)))
340 ;; for concatenation
341 ((eq group ':)
342 (if (rx-atomic-p
343 (if (string-match
344 "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
345 (substring regexp 0 (match-beginning 0))
346 regexp))
347 (setq group nil)))
348 ;; for OR
349 ((eq group '|) (setq group nil))
350 ;; do anyway
351 ((eq group t))
352 ((rx-atomic-p regexp t) (setq group nil)))
353 (if group
354 (concat "\\(?:" regexp "\\)")
355 regexp))
356
357
358(defvar rx-parent)
359;; dynamically bound in some functions.
360
361
12c64503
GM
362(defun rx-and (form)
363 "Parse and produce code from FORM.
364FORM is of the form `(and FORM1 ...)'."
365 (rx-check form)
5dbe5c8f
CY
366 (rx-group-if
367 (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
368 (and (memq rx-parent '(* t)) rx-parent)))
12c64503
GM
369
370
371(defun rx-or (form)
372 "Parse and produce code from FORM, which is `(or FORM1 ...)'."
373 (rx-check form)
5dbe5c8f
CY
374 (rx-group-if
375 (if (memq nil (mapcar 'stringp (cdr form)))
376 (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
377 (regexp-opt (cdr form)))
378 (and (memq rx-parent '(: * t)) rx-parent)))
379
380
381(defun rx-anything (form)
382 "Match any character."
383 (if (consp form)
384 (error "rx `anythng' syntax error: %s" form))
385 (rx-or (list 'or 'not-newline ?\n)))
386
387
388(defun rx-any-delete-from-range (char ranges)
389 "Delete by side effect character CHAR from RANGES.
390Only both edges of each range is checked."
391 (let (m)
392 (cond
393 ((memq char ranges) (setq ranges (delq char ranges)))
394 ((setq m (assq char ranges))
395 (if (eq (1+ char) (cdr m))
396 (setcar (memq m ranges) (1+ char))
397 (setcar m (1+ char))))
398 ((setq m (rassq char ranges))
399 (if (eq (1- char) (car m))
400 (setcar (memq m ranges) (1- char))
401 (setcdr m (1- char)))))
402 ranges))
403
404
405(defun rx-any-condense-range (args)
406 "Condense by side effect ARGS as range for Rx `any'."
407 (let (str
408 l)
409 ;; set STR list of all strings
410 ;; set L list of all ranges
411 (mapc (lambda (e) (cond ((stringp e) (push e str))
412 ((numberp e) (push (cons e e) l))
413 (t (push e l))))
414 args)
415 ;; condense overlapped ranges in L
416 (let ((tail (setq l (sort l #'car-less-than-car)))
417 d)
418 (while (setq d (cdr tail))
419 (if (>= (cdar tail) (1- (caar d)))
420 (progn
421 (setcdr (car tail) (max (cdar tail) (cdar d)))
422 (setcdr tail (cdr d)))
423 (setq tail d))))
424 ;; Separate small ranges to single number, and delete dups.
425 (nconc
426 (apply #'nconc
427 (mapcar (lambda (e)
428 (cond
429 ((= (car e) (cdr e)) (list (car e)))
430 ;; ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
431 ((list e))))
432 l))
433 (delete-dups str))))
434
435
436(defun rx-check-any-string (str)
437 "Check string argument STR for Rx `any'."
438 (let ((i 0)
439 c1 c2 l)
440 (if (= 0 (length str))
441 (error "String arg for Rx `any' must not be empty"))
442 (while (string-match ".-." str i)
443 ;; string before range: convert it to characters
444 (if (< i (match-beginning 0))
445 (setq l (nconc
446 l
447 (append (substring str i (match-beginning 0)) nil))))
448 ;; range
449 (setq i (match-end 0)
450 c1 (aref str (match-beginning 0))
451 c2 (aref str (1- i)))
452 (cond
453 ((< c1 c2) (setq l (nconc l (list (cons c1 c2)))))
454 ((= c1 c2) (setq l (nconc l (list c1))))))
455 ;; rest?
456 (if (< i (length str))
457 (setq l (nconc l (append (substring str i) nil))))
458 l))
12c64503 459
12c64503
GM
460
461(defun rx-check-any (arg)
462 "Check arg ARG for Rx `any'."
5dbe5c8f
CY
463 (cond
464 ((integerp arg) (list arg))
465 ((symbolp arg)
ccfbe679 466 (let ((translation (condition-case nil
5dbe5c8f 467 (rx-form arg)
ccfbe679 468 (error nil))))
5dbe5c8f
CY
469 (if (or (null translation)
470 (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation)))
471 (error "Invalid char class `%s' in Rx `any'" arg))
472 (list (substring translation 1 -1)))) ; strip outer brackets
473 ((and (integerp (car-safe arg)) (integerp (cdr-safe arg)))
474 (list arg))
475 ((stringp arg) (rx-check-any-string arg))
476 ((error
477 "rx `any' requires string, character, char pair or char class args"))))
478
12c64503
GM
479
480(defun rx-any (form)
ccfbe679
SM
481 "Parse and produce code from FORM, which is `(any ARG ...)'.
482ARG is optional."
12c64503 483 (rx-check form)
5dbe5c8f
CY
484 (let* ((args (rx-any-condense-range
485 (apply
486 #'nconc
487 (mapcar #'rx-check-any (cdr form)))))
488 m
489 s)
490 (cond
491 ;; single close bracket
492 ;; => "[]...-]" or "[]...--.]"
493 ((memq ?\] args)
494 ;; set ] at the beginning
495 (setq args (cons ?\] (delq ?\] args)))
496 ;; set - at the end
497 (if (or (memq ?- args) (assq ?- args))
498 (setq args (nconc (rx-any-delete-from-range ?- args)
499 (list ?-)))))
500 ;; close bracket starts a range
501 ;; => "[]-....-]" or "[]-.--....]"
502 ((setq m (assq ?\] args))
503 ;; bring it to the beginning
504 (setq args (cons m (delq m args)))
505 (cond ((memq ?- args)
506 ;; to the end
507 (setq args (nconc (delq ?- args) (list ?-))))
508 ((setq m (assq ?- args))
509 ;; next to the bracket's range, make the second range
510 (setcdr args (cons m (delq m args))))))
511 ;; bracket in the end range
512 ;; => "[]...-]"
513 ((setq m (rassq ?\] args))
514 ;; set ] at the beginning
515 (setq args (cons ?\] (rx-any-delete-from-range ?\] args)))
516 ;; set - at the end
517 (if (or (memq ?- args) (assq ?- args))
518 (setq args (nconc (rx-any-delete-from-range ?- args)
519 (list ?-)))))
520 ;; {no close bracket appears}
521 ;;
522 ;; bring single bar to the beginning
523 ((memq ?- args)
524 (setq args (cons ?- (delq ?- args))))
525 ;; bar start a range, bring it to the beginning
526 ((setq m (assq ?- args))
527 (setq args (cons m (delq m args))))
528 ;;
529 ;; hat at the beginning?
530 ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^))
531 (setq args (if (cdr args)
532 `(,(cadr args) ,(car args) ,@(cddr args))
533 (nconc (rx-any-delete-from-range ?^ args)
534 (list ?^))))))
535 ;; some 1-char?
536 (if (and (null (cdr args)) (numberp (car args))
537 (or (= 1 (length
538 (setq s (regexp-quote (string (car args))))))
539 (and (equal (car args) ?^) ;; unnecessary predicate?
540 (null (eq rx-parent '!)))))
541 s
542 (concat "["
543 (mapconcat
544 (lambda (e) (cond
545 ((numberp e) (string e))
546 ((consp e)
547 (if (and (= (1+ (car e)) (cdr e))
548 (null (memq (car e) '(?\] ?-))))
549 (string (car e) (cdr e))
550 (string (car e) ?- (cdr e))))
551 (e)))
552 args
553 nil)
554 "]"))))
12c64503
GM
555
556
740b7c2d
EZ
557(defun rx-check-not (arg)
558 "Check arg ARG for Rx `not'."
ccfbe679 559 (unless (or (and (symbolp arg)
5dbe5c8f 560 (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
ccfbe679 561 (condition-case nil
5dbe5c8f 562 (rx-form arg)
ccfbe679 563 (error ""))))
5dbe5c8f 564 (eq arg 'word-boundary)
ccfbe679
SM
565 (and (consp arg)
566 (memq (car arg) '(not any in syntax category))))
567 (error "rx `not' syntax error: %s" arg))
568 t)
12c64503
GM
569
570
571(defun rx-not (form)
572 "Parse and produce code from FORM. FORM is `(not ...)'."
573 (rx-check form)
5dbe5c8f 574 (let ((result (rx-form (cadr form) '!))
062a9fce 575 case-fold-search)
12c64503 576 (cond ((string-match "\\`\\[^" result)
5dbe5c8f
CY
577 (cond
578 ((equal result "[^]") "[^^]")
579 ((and (= (length result) 4) (null (eq rx-parent '!)))
580 (regexp-quote (substring result 2 3)))
581 ((concat "[" (substring result 2)))))
ccfbe679 582 ((eq ?\[ (aref result 0))
12c64503 583 (concat "[^" (substring result 1)))
5dbe5c8f
CY
584 ((string-match "\\`\\\\[scbw]" result)
585 (concat (upcase (substring result 0 2))
586 (substring result 2)))
587 ((string-match "\\`\\\\[SCBW]" result)
588 (concat (downcase (substring result 0 2))
589 (substring result 2)))
12c64503
GM
590 (t
591 (concat "[^" result "]")))))
592
593
ccfbe679
SM
594(defun rx-not-char (form)
595 "Parse and produce code from FORM. FORM is `(not-char ...)'."
596 (rx-check form)
597 (rx-not `(not (in ,@(cdr form)))))
598
599
600(defun rx-not-syntax (form)
601 "Parse and produce code from FORM. FORM is `(not-syntax SYNTAX)'."
602 (rx-check form)
603 (rx-not `(not (syntax ,@(cdr form)))))
604
605
606(defun rx-trans-forms (form &optional skip)
607 "If FORM's length is greater than two, transform it to length two.
608A form (HEAD REST ...) becomes (HEAD (and REST ...)).
609If SKIP is non-nil, allow that number of items after the head, i.e.
610`(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
611 (unless skip (setq skip 0))
612 (let ((tail (nthcdr (1+ skip) form)))
613 (if (= (length tail) 1)
614 form
615 (let ((form (copy-sequence form)))
616 (setcdr (nthcdr skip form) (list (cons 'and tail)))
617 form))))
618
619
620(defun rx-= (form)
621 "Parse and produce code from FORM `(= N ...)'."
622 (rx-check form)
623 (setq form (rx-trans-forms form 1))
624 (unless (and (integerp (nth 1 form))
625 (> (nth 1 form) 0))
626 (error "rx `=' requires positive integer first arg"))
5dbe5c8f 627 (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
ccfbe679
SM
628
629
630(defun rx->= (form)
631 "Parse and produce code from FORM `(>= N ...)'."
632 (rx-check form)
633 (setq form (rx-trans-forms form 1))
634 (unless (and (integerp (nth 1 form))
635 (> (nth 1 form) 0))
636 (error "rx `>=' requires positive integer first arg"))
5dbe5c8f 637 (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
ccfbe679
SM
638
639
640(defun rx-** (form)
641 "Parse and produce code from FORM `(** N M ...)'."
642 (rx-check form)
643 (setq form (cons 'repeat (cdr (rx-trans-forms form 2))))
5dbe5c8f 644 (rx-form form '*))
ccfbe679
SM
645
646
12c64503
GM
647(defun rx-repeat (form)
648 "Parse and produce code from FORM.
649FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'."
650 (rx-check form)
651 (cond ((= (length form) 3)
652 (unless (and (integerp (nth 1 form))
653 (> (nth 1 form) 0))
740b7c2d 654 (error "rx `repeat' requires positive integer first arg"))
5dbe5c8f 655 (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
12c64503
GM
656 ((or (not (integerp (nth 2 form)))
657 (< (nth 2 form) 0)
658 (not (integerp (nth 1 form)))
659 (< (nth 1 form) 0)
660 (< (nth 2 form) (nth 1 form)))
740b7c2d 661 (error "rx `repeat' range error"))
12c64503 662 (t
5dbe5c8f 663 (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
12c64503
GM
664 (nth 1 form) (nth 2 form)))))
665
666
667(defun rx-submatch (form)
668 "Parse and produce code from FORM, which is `(submatch ...)'."
e9e9c7b8
SM
669 (concat "\\("
670 (if (= 2 (length form))
671 ;; Only one sub-form.
672 (rx-form (cadr form))
673 ;; Several sub-forms implicitly concatenated.
674 (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
675 "\\)"))
5dbe5c8f 676
12c64503 677
740b7c2d
EZ
678(defun rx-backref (form)
679 "Parse and produce code from FORM, which is `(backref N)'."
680 (rx-check form)
681 (format "\\%d" (nth 1 form)))
682
683(defun rx-check-backref (arg)
684 "Check arg ARG for Rx `backref'."
685 (or (and (integerp arg) (>= arg 1) (<= arg 9))
686 (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
687
12c64503
GM
688(defun rx-kleene (form)
689 "Parse and produce code from FORM.
690FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
a1506d29 691`zero-or-more' etc. operators.
12c64503
GM
692If OP is one of `*', `+', `?', produce a greedy regexp.
693If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
694If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
695is non-nil."
696 (rx-check form)
ccfbe679 697 (setq form (rx-trans-forms form))
5dbe5c8f 698 (let ((suffix (cond ((memq (car form) '(* + ?\s)) "")
12c64503
GM
699 ((memq (car form) '(*? +? ??)) "?")
700 (rx-greedy-flag "")
701 (t "?")))
702 (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
703 ((memq (car form) '(+ +? 1+ one-or-more)) "+")
5dbe5c8f
CY
704 (t "?"))))
705 (rx-group-if
706 (concat (rx-form (cadr form) '*) op suffix)
707 (and (memq rx-parent '(t *)) rx-parent))))
c53f9b3b 708
5dbe5c8f
CY
709
710(defun rx-atomic-p (r &optional lax)
c53f9b3b
RS
711 "Return non-nil if regexp string R is atomic.
712An atomic regexp R is one such that a suffix operator
713appended to R will apply to all of R. For example, \"a\"
714\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
715\"[ab]c\", and \"ab\\|ab*c\" are not atomic.
716
717This function may return false negatives, but it will not
718return false positives. It is nevertheless useful in
ab2d877d 719situations where an efficiency shortcut can be taken only if a
c53f9b3b
RS
720regexp is atomic. The function can be improved to detect
721more cases of atomic regexps. Presently, this function
722detects the following categories of atomic regexp;
723
724 a group or shy group: \\(...\\)
725 a character class: [...]
726 a single character: a
727
728On the other hand, false negatives will be returned for
729regexps that are atomic but end in operators, such as
730\"a+\". I think these are rare. Probably such cases could
731be detected without much effort. A guarantee of no false
732negatives would require a theoretic specification of the set
733of all atomic regexps."
734 (let ((l (length r)))
5dbe5c8f
CY
735 (cond
736 ((<= l 1))
737 ((= l 2) (= (aref r 0) ?\\))
738 ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
739 ((null lax)
740 (cond
741 ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^\]]\\)*\\]\\'" r))
742 ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^\)]\\)*\\\\)\\'" r)))))))
12c64503
GM
743
744
745(defun rx-syntax (form)
746 "Parse and produce code from FORM, which is `(syntax SYMBOL)'."
747 (rx-check form)
09c774f7
SM
748 (let* ((sym (cadr form))
749 (syntax (assq sym rx-syntax)))
12c64503 750 (unless syntax
09c774f7
SM
751 ;; Try sregex compatibility.
752 (let ((name (symbol-name sym)))
753 (if (= 1 (length name))
754 (setq syntax (rassq (aref name 0) rx-syntax))))
755 (unless syntax
756 (error "Unknown rx syntax `%s'" (cadr form))))
12c64503
GM
757 (format "\\s%c" (cdr syntax))))
758
759
760(defun rx-check-category (form)
761 "Check the argument FORM of a `(category FORM)'."
762 (unless (or (integerp form)
763 (cdr (assq form rx-categories)))
764 (error "Unknown category `%s'" form))
765 t)
a1506d29 766
12c64503
GM
767
768(defun rx-category (form)
ccfbe679 769 "Parse and produce code from FORM, which is `(category SYMBOL)'."
12c64503
GM
770 (rx-check form)
771 (let ((char (if (integerp (cadr form))
772 (cadr form)
773 (cdr (assq (cadr form) rx-categories)))))
774 (format "\\c%c" char)))
775
776
777(defun rx-eval (form)
778 "Parse and produce code from FORM, which is `(eval FORM)'."
779 (rx-check form)
5dbe5c8f 780 (rx-form (eval (cadr form)) rx-parent))
12c64503
GM
781
782
783(defun rx-greedy (form)
740b7c2d
EZ
784 "Parse and produce code from FORM.
785If FORM is '(minimal-match FORM1)', non-greedy versions of `*',
786`+', and `?' operators will be used in FORM1. If FORM is
787'(maximal-match FORM1)', greedy operators will be used."
12c64503
GM
788 (rx-check form)
789 (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
5dbe5c8f 790 (rx-form (cadr form) rx-parent)))
12c64503
GM
791
792
793(defun rx-regexp (form)
794 "Parse and produce code from FORM, which is `(regexp STRING)'."
795 (rx-check form)
5dbe5c8f
CY
796 (rx-group-if (cadr form) rx-parent))
797
798
799(defun rx-form (form &optional rx-parent)
800 "Parse and produce code for regular expression FORM.
801FORM is a regular expression in sexp form.
802RX-PARENT shows which type of expression calls and controls putting of
803shy groups around the result and some more in other functions."
804 (if (stringp form)
805 (rx-group-if (regexp-quote form)
806 (if (and (eq rx-parent '*) (< 1 (length form)))
807 rx-parent))
808 (cond ((integerp form)
809 (regexp-quote (char-to-string form)))
810 ((symbolp form)
811 (let ((info (rx-info form)))
812 (cond ((stringp info)
813 info)
814 ((null info)
815 (error "Unknown rx form `%s'" form))
816 (t
817 (funcall (nth 0 info) form)))))
818 ((consp form)
819 (let ((info (rx-info (car form))))
820 (unless (consp info)
821 (error "Unknown rx form `%s'" (car form)))
822 (funcall (nth 0 info) form)))
823 (t
824 (error "rx syntax error at `%s'" form)))))
12c64503
GM
825
826
827;;;###autoload
828(defun rx-to-string (form &optional no-group)
829 "Parse and produce code for regular expression FORM.
830FORM is a regular expression in sexp form.
831NO-GROUP non-nil means don't put shy groups around the result."
5dbe5c8f 832 (rx-group-if (rx-form form) (null no-group)))
12c64503
GM
833
834
835;;;###autoload
ccfbe679
SM
836(defmacro rx (&rest regexps)
837 "Translate regular expressions REGEXPS in sexp form to a regexp string.
838REGEXPS is a non-empty sequence of forms of the sort listed below.
baac7510
CY
839
840Note that `rx' is a Lisp macro; when used in a Lisp program being
841 compiled, the translation is performed by the compiler.
842See `rx-to-string' for how to do such a translation at run-time.
12c64503
GM
843
844The following are valid subforms of regular expressions in sexp
845notation.
846
847STRING
848 matches string STRING literally.
849
850CHAR
851 matches character CHAR literally.
852
ccfbe679 853`not-newline', `nonl'
12c64503 854 matches any character except a newline.
e8449cdb 855
12c64503
GM
856`anything'
857 matches any character
858
ccfbe679
SM
859`(any SET ...)'
860`(in SET ...)'
861`(char SET ...)'
862 matches any character in SET .... SET may be a character or string.
12c64503 863 Ranges of characters can be specified as `A-Z' in strings.
ccfbe679 864 Ranges may also be specified as conses like `(?A . ?Z)'.
12c64503 865
ccfbe679
SM
866 SET may also be the name of a character class: `digit',
867 `control', `hex-digit', `blank', `graph', `print', `alnum',
868 `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
869 `word', or one of their synonyms.
12c64503 870
ccfbe679
SM
871`(not (any SET ...))'
872 matches any character not in SET ...
12c64503 873
ccfbe679 874`line-start', `bol'
12c64503
GM
875 matches the empty string, but only at the beginning of a line
876 in the text being matched
877
ccfbe679 878`line-end', `eol'
12c64503
GM
879 is similar to `line-start' but matches only at the end of a line
880
ccfbe679 881`string-start', `bos', `bot'
12c64503
GM
882 matches the empty string, but only at the beginning of the
883 string being matched against.
884
ccfbe679 885`string-end', `eos', `eot'
12c64503
GM
886 matches the empty string, but only at the end of the
887 string being matched against.
888
889`buffer-start'
890 matches the empty string, but only at the beginning of the
ccfbe679 891 buffer being matched against. Actually equivalent to `string-start'.
12c64503
GM
892
893`buffer-end'
894 matches the empty string, but only at the end of the
ccfbe679 895 buffer being matched against. Actually equivalent to `string-end'.
12c64503
GM
896
897`point'
898 matches the empty string, but only at point.
899
ccfbe679 900`word-start', `bow'
5e3fc9eb 901 matches the empty string, but only at the beginning of a word.
12c64503 902
ccfbe679 903`word-end', `eow'
12c64503
GM
904 matches the empty string, but only at the end of a word.
905
906`word-boundary'
907 matches the empty string, but only at the beginning or end of a
908 word.
909
910`(not word-boundary)'
ccfbe679 911`not-word-boundary'
12c64503
GM
912 matches the empty string, but not at the beginning or end of a
913 word.
914
5e3fc9eb
GM
915`symbol-start'
916 matches the empty string, but only at the beginning of a symbol.
917
918`symbol-end'
919 matches the empty string, but only at the end of a symbol.
920
ccfbe679 921`digit', `numeric', `num'
12c64503
GM
922 matches 0 through 9.
923
ccfbe679 924`control', `cntrl'
12c64503
GM
925 matches ASCII control characters.
926
ccfbe679 927`hex-digit', `hex', `xdigit'
12c64503
GM
928 matches 0 through 9, a through f and A through F.
929
930`blank'
931 matches space and tab only.
932
ccfbe679 933`graphic', `graph'
12c64503
GM
934 matches graphic characters--everything except ASCII control chars,
935 space, and DEL.
936
ccfbe679 937`printing', `print'
12c64503
GM
938 matches printing characters--everything except ASCII control chars
939 and DEL.
940
ccfbe679 941`alphanumeric', `alnum'
12c64503
GM
942 matches letters and digits. (But at present, for multibyte characters,
943 it matches anything that has word syntax.)
944
ccfbe679 945`letter', `alphabetic', `alpha'
12c64503
GM
946 matches letters. (But at present, for multibyte characters,
947 it matches anything that has word syntax.)
948
949`ascii'
950 matches ASCII (unibyte) characters.
951
952`nonascii'
953 matches non-ASCII (multibyte) characters.
954
ccfbe679 955`lower', `lower-case'
12c64503
GM
956 matches anything lower-case.
957
ccfbe679 958`upper', `upper-case'
12c64503
GM
959 matches anything upper-case.
960
ccfbe679 961`punctuation', `punct'
12c64503
GM
962 matches punctuation. (But at present, for multibyte characters,
963 it matches anything that has non-word syntax.)
964
ccfbe679 965`space', `whitespace', `white'
12c64503
GM
966 matches anything that has whitespace syntax.
967
ccfbe679 968`word', `wordchar'
12c64503
GM
969 matches anything that has word syntax.
970
ccfbe679
SM
971`not-wordchar'
972 matches anything that has non-word syntax.
973
12c64503
GM
974`(syntax SYNTAX)'
975 matches a character with syntax SYNTAX. SYNTAX must be one
ccfbe679
SM
976 of the following symbols, or a symbol corresponding to the syntax
977 character, e.g. `\\.' for `\\s.'.
12c64503
GM
978
979 `whitespace' (\\s- in string notation)
980 `punctuation' (\\s.)
981 `word' (\\sw)
982 `symbol' (\\s_)
983 `open-parenthesis' (\\s()
984 `close-parenthesis' (\\s))
985 `expression-prefix' (\\s')
986 `string-quote' (\\s\")
987 `paired-delimiter' (\\s$)
988 `escape' (\\s\\)
989 `character-quote' (\\s/)
990 `comment-start' (\\s<)
991 `comment-end' (\\s>)
740b7c2d
EZ
992 `string-delimiter' (\\s|)
993 `comment-delimiter' (\\s!)
12c64503
GM
994
995`(not (syntax SYNTAX))'
ccfbe679 996 matches a character that doesn't have syntax SYNTAX.
12c64503
GM
997
998`(category CATEGORY)'
999 matches a character with category CATEGORY. CATEGORY must be
1000 either a character to use for C, or one of the following symbols.
1001
1002 `consonant' (\\c0 in string notation)
1003 `base-vowel' (\\c1)
1004 `upper-diacritical-mark' (\\c2)
1005 `lower-diacritical-mark' (\\c3)
1006 `tone-mark' (\\c4)
1007 `symbol' (\\c5)
1008 `digit' (\\c6)
1009 `vowel-modifying-diacritical-mark' (\\c7)
1010 `vowel-sign' (\\c8)
1011 `semivowel-lower' (\\c9)
1012 `not-at-end-of-line' (\\c<)
1013 `not-at-beginning-of-line' (\\c>)
1014 `alpha-numeric-two-byte' (\\cA)
1015 `chinse-two-byte' (\\cC)
1016 `greek-two-byte' (\\cG)
1017 `japanese-hiragana-two-byte' (\\cH)
1018 `indian-tow-byte' (\\cI)
1019 `japanese-katakana-two-byte' (\\cK)
1020 `korean-hangul-two-byte' (\\cN)
1021 `cyrillic-two-byte' (\\cY)
ccfbe679 1022 `combining-diacritic' (\\c^)
12c64503
GM
1023 `ascii' (\\ca)
1024 `arabic' (\\cb)
1025 `chinese' (\\cc)
1026 `ethiopic' (\\ce)
1027 `greek' (\\cg)
1028 `korean' (\\ch)
1029 `indian' (\\ci)
1030 `japanese' (\\cj)
1031 `japanese-katakana' (\\ck)
1032 `latin' (\\cl)
1033 `lao' (\\co)
1034 `tibetan' (\\cq)
1035 `japanese-roman' (\\cr)
1036 `thai' (\\ct)
1037 `vietnamese' (\\cv)
1038 `hebrew' (\\cw)
1039 `cyrillic' (\\cy)
1040 `can-break' (\\c|)
1041
1042`(not (category CATEGORY))'
ccfbe679 1043 matches a character that doesn't have category CATEGORY.
12c64503
GM
1044
1045`(and SEXP1 SEXP2 ...)'
ccfbe679
SM
1046`(: SEXP1 SEXP2 ...)'
1047`(seq SEXP1 SEXP2 ...)'
1048`(sequence SEXP1 SEXP2 ...)'
12c64503
GM
1049 matches what SEXP1 matches, followed by what SEXP2 matches, etc.
1050
1051`(submatch SEXP1 SEXP2 ...)'
ccfbe679 1052`(group SEXP1 SEXP2 ...)'
12c64503
GM
1053 like `and', but makes the match accessible with `match-end',
1054 `match-beginning', and `match-string'.
1055
5dbe5c8f
CY
1056`(group SEXP1 SEXP2 ...)'
1057 another name for `submatch'.
1058
12c64503 1059`(or SEXP1 SEXP2 ...)'
ccfbe679 1060`(| SEXP1 SEXP2 ...)'
12c64503
GM
1061 matches anything that matches SEXP1 or SEXP2, etc. If all
1062 args are strings, use `regexp-opt' to optimize the resulting
1063 regular expression.
1064
1065`(minimal-match SEXP)'
1066 produce a non-greedy regexp for SEXP. Normally, regexps matching
740b7c2d 1067 zero or more occurrences of something are \"greedy\" in that they
12c64503
GM
1068 match as much as they can, as long as the overall regexp can
1069 still match. A non-greedy regexp matches as little as possible.
1070
1071`(maximal-match SEXP)'
0a6cac62 1072 produce a greedy regexp for SEXP. This is the default.
12c64503 1073
ccfbe679
SM
1074Below, `SEXP ...' represents a sequence of regexp forms, treated as if
1075enclosed in `(and ...)'.
12c64503 1076
ccfbe679
SM
1077`(zero-or-more SEXP ...)'
1078`(0+ SEXP ...)'
1079 matches zero or more occurrences of what SEXP ... matches.
12c64503 1080
ccfbe679
SM
1081`(* SEXP ...)'
1082 like `zero-or-more', but always produces a greedy regexp, independent
1083 of `rx-greedy-flag'.
12c64503 1084
ccfbe679
SM
1085`(*? SEXP ...)'
1086 like `zero-or-more', but always produces a non-greedy regexp,
1087 independent of `rx-greedy-flag'.
a1506d29 1088
ccfbe679
SM
1089`(one-or-more SEXP ...)'
1090`(1+ SEXP ...)'
1091 matches one or more occurrences of SEXP ...
12c64503 1092
ccfbe679 1093`(+ SEXP ...)'
12c64503
GM
1094 like `one-or-more', but always produces a greedy regexp.
1095
ccfbe679 1096`(+? SEXP ...)'
12c64503
GM
1097 like `one-or-more', but always produces a non-greedy regexp.
1098
ccfbe679
SM
1099`(zero-or-one SEXP ...)'
1100`(optional SEXP ...)'
1101`(opt SEXP ...)'
12c64503 1102 matches zero or one occurrences of A.
a1506d29 1103
ccfbe679 1104`(? SEXP ...)'
12c64503
GM
1105 like `zero-or-one', but always produces a greedy regexp.
1106
ccfbe679 1107`(?? SEXP ...)'
12c64503
GM
1108 like `zero-or-one', but always produces a non-greedy regexp.
1109
1110`(repeat N SEXP)'
ccfbe679
SM
1111`(= N SEXP ...)'
1112 matches N occurrences.
1113
1114`(>= N SEXP ...)'
1115 matches N or more occurrences.
12c64503
GM
1116
1117`(repeat N M SEXP)'
ccfbe679
SM
1118`(** N M SEXP ...)'
1119 matches N to M occurrences.
1120
942269e7
JB
1121`(backref N)'
1122 matches what was matched previously by submatch N.
1123
12c64503 1124`(eval FORM)'
942269e7
JB
1125 evaluate FORM and insert result. If result is a string,
1126 `regexp-quote' it.
12c64503
GM
1127
1128`(regexp REGEXP)'
942269e7 1129 include REGEXP in string notation in the result."
ccfbe679
SM
1130 (cond ((null regexps)
1131 (error "No regexp"))
1132 ((cdr regexps)
1133 (rx-to-string `(and ,@regexps) t))
1134 (t
1135 (rx-to-string (car regexps) t))))
1136\f
1137;; ;; sregex.el replacement
1138
1139;; ;;;###autoload (provide 'sregex)
1140;; ;;;###autoload (autoload 'sregex "rx")
1141;; (defalias 'sregex 'rx-to-string)
1142;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
1143;; (defalias 'sregexq 'rx)
1144\f
12c64503
GM
1145(provide 'rx)
1146
b62c13c2 1147;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b
12c64503 1148;;; rx.el ends here