* lisp/files.el (safe-local-variable-p): Gracefully handle errors.
[bpt/emacs.git] / lisp / emacs-lisp / rx.el
CommitLineData
12c64503
GM
1;;; rx.el --- sexp notation for regular expressions
2
ceb4c4d3 3;; Copyright (C) 2001, 2002, 2003, 2004, 2005,
114f9c96 4;; 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
12c64503
GM
5
6;; Author: Gerd Moellmann <gerd@gnu.org>
7;; Maintainer: FSF
8;; Keywords: strings, regexps, extensions
9
10;; This file is part of GNU Emacs.
11
d6cba7ae 12;; GNU Emacs is free software: you can redistribute it and/or modify
12c64503 13;; it under the terms of the GNU General Public License as published by
d6cba7ae
GM
14;; the Free Software Foundation, either version 3 of the License, or
15;; (at your option) any later version.
12c64503
GM
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
d6cba7ae 23;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
12c64503
GM
24
25;;; Commentary:
26
27;; This is another implementation of sexp-form regular expressions.
28;; It was unfortunately written without being aware of the Sregex
29;; package coming with Emacs, but as things stand, Rx completely
30;; covers all regexp features, which Sregex doesn't, doesn't suffer
31;; from the bugs mentioned in the commentary section of Sregex, and
32;; uses a nicer syntax (IMHO, of course :-).
33
ccfbe679
SM
34;; This significantly extended version of the original, is almost
35;; compatible with Sregex. The only incompatibility I (fx) know of is
36;; that the `repeat' form can't have multiple regexp args.
37
38;; Now alternative forms are provided for a degree of compatibility
39;; with Shivers' attempted definitive SRE notation
40;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>. SRE forms not
41;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
42;; ,<exp>, (word ...), word+, posix-string, and character class forms.
43;; Some forms are inconsistent with SRE, either for historical reasons
44;; or because of the implementation -- simple translation into Emacs
45;; regexp strings. These include: any, word. Also, case-sensitivity
46;; and greediness are controlled by variables external to the regexp,
47;; and you need to feed the forms to the `posix-' functions to get
48;; SRE's POSIX semantics. There are probably more difficulties.
49
12c64503
GM
50;; Rx translates a sexp notation for regular expressions into the
51;; usual string notation. The translation can be done at compile-time
52;; by using the `rx' macro. It can be done at run-time by calling
53;; function `rx-to-string'. See the documentation of `rx' for a
54;; complete description of the sexp notation.
55;;
56;; Some examples of string regexps and their sexp counterparts:
57;;
58;; "^[a-z]*"
59;; (rx (and line-start (0+ (in "a-z"))))
60;;
61;; "\n[^ \t]"
62;; (rx (and "\n" (not blank))), or
63;; (rx (and "\n" (not (any " \t"))))
64;;
65;; "\\*\\*\\* EOOH \\*\\*\\*\n"
66;; (rx "*** EOOH ***\n")
67;;
68;; "\\<\\(catch\\|finally\\)\\>[^_]"
69;; (rx (and word-start (submatch (or "catch" "finally")) word-end
70;; (not (any ?_))))
71;;
72;; "[ \t\n]*:\\([^:]+\\|$\\)"
73;; (rx (and (zero-or-more (in " \t\n")) ":"
74;; (submatch (or line-end (one-or-more (not (any ?:)))))))
75;;
76;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
77;; (rx (and line-start
78;; "content-transfer-encoding:"
c53f9b3b 79;; (+ (? ?\n)) blank
12c64503 80;; "quoted-printable"
c53f9b3b 81;; (+ (? ?\n)) blank))
12c64503
GM
82;;
83;; (concat "^\\(?:" something-else "\\)")
84;; (rx (and line-start (eval something-else))), statically or
85;; (rx-to-string '(and line-start ,something-else)), dynamically.
86;;
87;; (regexp-opt '(STRING1 STRING2 ...))
88;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
89;; calls `regexp-opt' as needed.
90;;
91;; "^;;\\s-*\n\\|^\n"
92;; (rx (or (and line-start ";;" (0+ space) ?\n)
93;; (and line-start ?\n)))
94;;
95;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
a1506d29
JB
96;; (rx (and "$Id: "
97;; (1+ (not (in " ")))
12c64503
GM
98;; " "
99;; (submatch (1+ (not (in " "))))
c53f9b3b 100;; " "))
12c64503
GM
101;;
102;; "\\\\\\\\\\[\\w+"
103;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
104;;
105;; etc.
106
107;;; History:
a1506d29 108;;
12c64503
GM
109
110;;; Code:
111
12c64503
GM
112(defconst rx-constituents
113 '((and . (rx-and 1 nil))
ccfbe679
SM
114 (seq . and) ; SRE
115 (: . and) ; SRE
116 (sequence . and) ; sregex
12c64503 117 (or . (rx-or 1 nil))
ccfbe679 118 (| . or) ; SRE
12c64503 119 (not-newline . ".")
ccfbe679 120 (nonl . not-newline) ; SRE
5dbe5c8f 121 (anything . (rx-anything 0 nil))
ccfbe679 122 (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
12c64503 123 (in . any)
ccfbe679
SM
124 (char . any) ; sregex
125 (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex
12c64503 126 (not . (rx-not 1 1 rx-check-not))
ccfbe679
SM
127 ;; Partially consistent with sregex, whose `repeat' is like our
128 ;; `**'. (`repeat' with optional max arg and multiple sexp forms
129 ;; is ambiguous.)
12c64503 130 (repeat . (rx-repeat 2 3))
ccfbe679
SM
131 (= . (rx-= 2 nil)) ; SRE
132 (>= . (rx->= 2 nil)) ; SRE
133 (** . (rx-** 2 nil)) ; SRE
134 (submatch . (rx-submatch 1 nil)) ; SRE
12c64503 135 (group . submatch)
ccfbe679
SM
136 (zero-or-more . (rx-kleene 1 nil))
137 (one-or-more . (rx-kleene 1 nil))
138 (zero-or-one . (rx-kleene 1 nil))
139 (\? . zero-or-one) ; SRE
12c64503 140 (\?? . zero-or-one)
ccfbe679 141 (* . zero-or-more) ; SRE
12c64503
GM
142 (*? . zero-or-more)
143 (0+ . zero-or-more)
ccfbe679 144 (+ . one-or-more) ; SRE
12c64503
GM
145 (+? . one-or-more)
146 (1+ . one-or-more)
147 (optional . zero-or-one)
ccfbe679 148 (opt . zero-or-one) ; sregex
12c64503
GM
149 (minimal-match . (rx-greedy 1 1))
150 (maximal-match . (rx-greedy 1 1))
740b7c2d 151 (backref . (rx-backref 1 1 rx-check-backref))
12c64503 152 (line-start . "^")
ccfbe679 153 (bol . line-start) ; SRE
12c64503 154 (line-end . "$")
ccfbe679 155 (eol . line-end) ; SRE
12c64503 156 (string-start . "\\`")
ccfbe679
SM
157 (bos . string-start) ; SRE
158 (bot . string-start) ; sregex
12c64503 159 (string-end . "\\'")
ccfbe679
SM
160 (eos . string-end) ; SRE
161 (eot . string-end) ; sregex
12c64503
GM
162 (buffer-start . "\\`")
163 (buffer-end . "\\'")
164 (point . "\\=")
165 (word-start . "\\<")
ccfbe679 166 (bow . word-start) ; SRE
12c64503 167 (word-end . "\\>")
ccfbe679 168 (eow . word-end) ; SRE
12c64503 169 (word-boundary . "\\b")
ccfbe679 170 (not-word-boundary . "\\B") ; sregex
b62c13c2
SM
171 (symbol-start . "\\_<")
172 (symbol-end . "\\_>")
12c64503 173 (syntax . (rx-syntax 1 1))
ccfbe679 174 (not-syntax . (rx-not-syntax 1 1)) ; sregex
12c64503
GM
175 (category . (rx-category 1 1 rx-check-category))
176 (eval . (rx-eval 1 1))
177 (regexp . (rx-regexp 1 1 stringp))
178 (digit . "[[:digit:]]")
ccfbe679
SM
179 (numeric . digit) ; SRE
180 (num . digit) ; SRE
181 (control . "[[:cntrl:]]") ; SRE
182 (cntrl . control) ; SRE
183 (hex-digit . "[[:xdigit:]]") ; SRE
184 (hex . hex-digit) ; SRE
185 (xdigit . hex-digit) ; SRE
186 (blank . "[[:blank:]]") ; SRE
187 (graphic . "[[:graph:]]") ; SRE
188 (graph . graphic) ; SRE
189 (printing . "[[:print:]]") ; SRE
190 (print . printing) ; SRE
191 (alphanumeric . "[[:alnum:]]") ; SRE
192 (alnum . alphanumeric) ; SRE
12c64503 193 (letter . "[[:alpha:]]")
ccfbe679
SM
194 (alphabetic . letter) ; SRE
195 (alpha . letter) ; SRE
196 (ascii . "[[:ascii:]]") ; SRE
12c64503 197 (nonascii . "[[:nonascii:]]")
ccfbe679
SM
198 (lower . "[[:lower:]]") ; SRE
199 (lower-case . lower) ; SRE
200 (punctuation . "[[:punct:]]") ; SRE
201 (punct . punctuation) ; SRE
202 (space . "[[:space:]]") ; SRE
203 (whitespace . space) ; SRE
204 (white . space) ; SRE
205 (upper . "[[:upper:]]") ; SRE
206 (upper-case . upper) ; SRE
207 (word . "[[:word:]]") ; inconsistent with SRE
208 (wordchar . word) ; sregex
5dbe5c8f 209 (not-wordchar . "\\W"))
12c64503
GM
210 "Alist of sexp form regexp constituents.
211Each element of the alist has the form (SYMBOL . DEFN).
212SYMBOL is a valid constituent of sexp regular expressions.
213If DEFN is a string, SYMBOL is translated into DEFN.
214If DEFN is a symbol, use the definition of DEFN, recursively.
215Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
216FUNCTION is used to produce code for SYMBOL. MIN-ARGS and MAX-ARGS
217are the minimum and maximum number of arguments the function-form
218sexp constituent SYMBOL may have in sexp regular expressions.
219MAX-ARGS nil means no limit. PREDICATE, if specified, means that
220all arguments must satisfy PREDICATE.")
221
222
223(defconst rx-syntax
224 '((whitespace . ?-)
225 (punctuation . ?.)
226 (word . ?w)
227 (symbol . ?_)
228 (open-parenthesis . ?\()
229 (close-parenthesis . ?\))
230 (expression-prefix . ?\')
231 (string-quote . ?\")
232 (paired-delimiter . ?$)
233 (escape . ?\\)
234 (character-quote . ?/)
235 (comment-start . ?<)
740b7c2d
EZ
236 (comment-end . ?>)
237 (string-delimiter . ?|)
09c774f7 238 (comment-delimiter . ?!))
12c64503
GM
239 "Alist mapping Rx syntax symbols to syntax characters.
240Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
241symbol in `(syntax SYMBOL)', and CHAR is the syntax character
242corresponding to SYMBOL, as it would be used with \\s or \\S in
243regular expressions.")
244
245
246(defconst rx-categories
247 '((consonant . ?0)
248 (base-vowel . ?1)
249 (upper-diacritical-mark . ?2)
250 (lower-diacritical-mark . ?3)
251 (tone-mark . ?4)
252 (symbol . ?5)
253 (digit . ?6)
254 (vowel-modifying-diacritical-mark . ?7)
255 (vowel-sign . ?8)
256 (semivowel-lower . ?9)
257 (not-at-end-of-line . ?<)
258 (not-at-beginning-of-line . ?>)
259 (alpha-numeric-two-byte . ?A)
260 (chinse-two-byte . ?C)
261 (greek-two-byte . ?G)
262 (japanese-hiragana-two-byte . ?H)
263 (indian-two-byte . ?I)
264 (japanese-katakana-two-byte . ?K)
265 (korean-hangul-two-byte . ?N)
266 (cyrillic-two-byte . ?Y)
740b7c2d 267 (combining-diacritic . ?^)
12c64503
GM
268 (ascii . ?a)
269 (arabic . ?b)
270 (chinese . ?c)
271 (ethiopic . ?e)
272 (greek . ?g)
273 (korean . ?h)
274 (indian . ?i)
275 (japanese . ?j)
276 (japanese-katakana . ?k)
277 (latin . ?l)
278 (lao . ?o)
279 (tibetan . ?q)
280 (japanese-roman . ?r)
281 (thai . ?t)
282 (vietnamese . ?v)
283 (hebrew . ?w)
284 (cyrillic . ?y)
285 (can-break . ?|))
286 "Alist mapping symbols to category characters.
287Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
288symbol in `(category SYMBOL)', and CHAR is the category character
289corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
290regular expression strings.")
291
292
293(defvar rx-greedy-flag t
294 "Non-nil means produce greedy regular expressions for `zero-or-one',
295`zero-or-more', and `one-or-more'. Dynamically bound.")
296
297
298(defun rx-info (op)
299 "Return parsing/code generation info for OP.
300If OP is the space character ASCII 32, return info for the symbol `?'.
301If OP is the character `?', return info for the symbol `??'.
302See also `rx-constituents'."
303 (cond ((eq op ? ) (setq op '\?))
304 ((eq op ??) (setq op '\??)))
305 (while (and (not (null op)) (symbolp op))
306 (setq op (cdr (assq op rx-constituents))))
307 op)
a1506d29 308
12c64503
GM
309
310(defun rx-check (form)
311 "Check FORM according to its car's parsing info."
ccfbe679
SM
312 (unless (listp form)
313 (error "rx `%s' needs argument(s)" form))
12c64503
GM
314 (let* ((rx (rx-info (car form)))
315 (nargs (1- (length form)))
316 (min-args (nth 1 rx))
317 (max-args (nth 2 rx))
318 (type-pred (nth 3 rx)))
319 (when (and (not (null min-args))
320 (< nargs min-args))
740b7c2d 321 (error "rx form `%s' requires at least %d args"
12c64503
GM
322 (car form) min-args))
323 (when (and (not (null max-args))
324 (> nargs max-args))
740b7c2d 325 (error "rx form `%s' accepts at most %d args"
12c64503
GM
326 (car form) max-args))
327 (when (not (null type-pred))
328 (dolist (sub-form (cdr form))
329 (unless (funcall type-pred sub-form)
740b7c2d 330 (error "rx form `%s' requires args satisfying `%s'"
12c64503
GM
331 (car form) type-pred))))))
332
333
5dbe5c8f
CY
334(defun rx-group-if (regexp group)
335 "Put shy groups around REGEXP if seemingly necessary when GROUP
336is non-nil."
337 (cond
338 ;; for some repetition
339 ((eq group '*) (if (rx-atomic-p regexp) (setq group nil)))
340 ;; for concatenation
341 ((eq group ':)
342 (if (rx-atomic-p
343 (if (string-match
344 "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
345 (substring regexp 0 (match-beginning 0))
346 regexp))
347 (setq group nil)))
348 ;; for OR
349 ((eq group '|) (setq group nil))
350 ;; do anyway
351 ((eq group t))
352 ((rx-atomic-p regexp t) (setq group nil)))
353 (if group
354 (concat "\\(?:" regexp "\\)")
355 regexp))
356
357
358(defvar rx-parent)
359;; dynamically bound in some functions.
360
361
12c64503
GM
362(defun rx-and (form)
363 "Parse and produce code from FORM.
364FORM is of the form `(and FORM1 ...)'."
365 (rx-check form)
5dbe5c8f
CY
366 (rx-group-if
367 (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
368 (and (memq rx-parent '(* t)) rx-parent)))
12c64503
GM
369
370
371(defun rx-or (form)
372 "Parse and produce code from FORM, which is `(or FORM1 ...)'."
373 (rx-check form)
5dbe5c8f
CY
374 (rx-group-if
375 (if (memq nil (mapcar 'stringp (cdr form)))
376 (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
377 (regexp-opt (cdr form)))
378 (and (memq rx-parent '(: * t)) rx-parent)))
379
380
381(defun rx-anything (form)
382 "Match any character."
383 (if (consp form)
384 (error "rx `anythng' syntax error: %s" form))
385 (rx-or (list 'or 'not-newline ?\n)))
386
387
388(defun rx-any-delete-from-range (char ranges)
389 "Delete by side effect character CHAR from RANGES.
390Only both edges of each range is checked."
391 (let (m)
392 (cond
393 ((memq char ranges) (setq ranges (delq char ranges)))
394 ((setq m (assq char ranges))
395 (if (eq (1+ char) (cdr m))
396 (setcar (memq m ranges) (1+ char))
397 (setcar m (1+ char))))
398 ((setq m (rassq char ranges))
399 (if (eq (1- char) (car m))
400 (setcar (memq m ranges) (1- char))
401 (setcdr m (1- char)))))
402 ranges))
403
404
405(defun rx-any-condense-range (args)
406 "Condense by side effect ARGS as range for Rx `any'."
407 (let (str
408 l)
409 ;; set STR list of all strings
410 ;; set L list of all ranges
411 (mapc (lambda (e) (cond ((stringp e) (push e str))
412 ((numberp e) (push (cons e e) l))
413 (t (push e l))))
414 args)
415 ;; condense overlapped ranges in L
416 (let ((tail (setq l (sort l #'car-less-than-car)))
417 d)
418 (while (setq d (cdr tail))
419 (if (>= (cdar tail) (1- (caar d)))
420 (progn
421 (setcdr (car tail) (max (cdar tail) (cdar d)))
422 (setcdr tail (cdr d)))
423 (setq tail d))))
424 ;; Separate small ranges to single number, and delete dups.
425 (nconc
426 (apply #'nconc
427 (mapcar (lambda (e)
428 (cond
429 ((= (car e) (cdr e)) (list (car e)))
19fe13df 430 ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
5dbe5c8f
CY
431 ((list e))))
432 l))
433 (delete-dups str))))
434
435
436(defun rx-check-any-string (str)
437 "Check string argument STR for Rx `any'."
438 (let ((i 0)
439 c1 c2 l)
440 (if (= 0 (length str))
441 (error "String arg for Rx `any' must not be empty"))
442 (while (string-match ".-." str i)
443 ;; string before range: convert it to characters
444 (if (< i (match-beginning 0))
445 (setq l (nconc
446 l
447 (append (substring str i (match-beginning 0)) nil))))
448 ;; range
449 (setq i (match-end 0)
450 c1 (aref str (match-beginning 0))
451 c2 (aref str (1- i)))
452 (cond
453 ((< c1 c2) (setq l (nconc l (list (cons c1 c2)))))
454 ((= c1 c2) (setq l (nconc l (list c1))))))
455 ;; rest?
456 (if (< i (length str))
457 (setq l (nconc l (append (substring str i) nil))))
458 l))
12c64503 459
12c64503
GM
460
461(defun rx-check-any (arg)
462 "Check arg ARG for Rx `any'."
5dbe5c8f
CY
463 (cond
464 ((integerp arg) (list arg))
465 ((symbolp arg)
ccfbe679 466 (let ((translation (condition-case nil
5dbe5c8f 467 (rx-form arg)
ccfbe679 468 (error nil))))
5dbe5c8f
CY
469 (if (or (null translation)
470 (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation)))
471 (error "Invalid char class `%s' in Rx `any'" arg))
472 (list (substring translation 1 -1)))) ; strip outer brackets
473 ((and (integerp (car-safe arg)) (integerp (cdr-safe arg)))
474 (list arg))
475 ((stringp arg) (rx-check-any-string arg))
476 ((error
477 "rx `any' requires string, character, char pair or char class args"))))
478
12c64503
GM
479
480(defun rx-any (form)
ccfbe679
SM
481 "Parse and produce code from FORM, which is `(any ARG ...)'.
482ARG is optional."
12c64503 483 (rx-check form)
5dbe5c8f
CY
484 (let* ((args (rx-any-condense-range
485 (apply
486 #'nconc
487 (mapcar #'rx-check-any (cdr form)))))
488 m
489 s)
490 (cond
491 ;; single close bracket
492 ;; => "[]...-]" or "[]...--.]"
493 ((memq ?\] args)
494 ;; set ] at the beginning
495 (setq args (cons ?\] (delq ?\] args)))
496 ;; set - at the end
497 (if (or (memq ?- args) (assq ?- args))
498 (setq args (nconc (rx-any-delete-from-range ?- args)
499 (list ?-)))))
500 ;; close bracket starts a range
501 ;; => "[]-....-]" or "[]-.--....]"
502 ((setq m (assq ?\] args))
503 ;; bring it to the beginning
504 (setq args (cons m (delq m args)))
505 (cond ((memq ?- args)
506 ;; to the end
507 (setq args (nconc (delq ?- args) (list ?-))))
508 ((setq m (assq ?- args))
509 ;; next to the bracket's range, make the second range
510 (setcdr args (cons m (delq m args))))))
511 ;; bracket in the end range
512 ;; => "[]...-]"
513 ((setq m (rassq ?\] args))
514 ;; set ] at the beginning
515 (setq args (cons ?\] (rx-any-delete-from-range ?\] args)))
516 ;; set - at the end
517 (if (or (memq ?- args) (assq ?- args))
518 (setq args (nconc (rx-any-delete-from-range ?- args)
519 (list ?-)))))
520 ;; {no close bracket appears}
521 ;;
522 ;; bring single bar to the beginning
523 ((memq ?- args)
524 (setq args (cons ?- (delq ?- args))))
525 ;; bar start a range, bring it to the beginning
526 ((setq m (assq ?- args))
527 (setq args (cons m (delq m args))))
528 ;;
529 ;; hat at the beginning?
530 ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^))
531 (setq args (if (cdr args)
532 `(,(cadr args) ,(car args) ,@(cddr args))
533 (nconc (rx-any-delete-from-range ?^ args)
534 (list ?^))))))
535 ;; some 1-char?
536 (if (and (null (cdr args)) (numberp (car args))
537 (or (= 1 (length
538 (setq s (regexp-quote (string (car args))))))
539 (and (equal (car args) ?^) ;; unnecessary predicate?
540 (null (eq rx-parent '!)))))
541 s
542 (concat "["
543 (mapconcat
544 (lambda (e) (cond
545 ((numberp e) (string e))
546 ((consp e)
547 (if (and (= (1+ (car e)) (cdr e))
19fe13df
SM
548 ;; rx-any-condense-range should
549 ;; prevent this case from happening.
550 (null (memq (car e) '(?\] ?-)))
551 (null (memq (cdr e) '(?\] ?-))))
5dbe5c8f
CY
552 (string (car e) (cdr e))
553 (string (car e) ?- (cdr e))))
554 (e)))
555 args
556 nil)
557 "]"))))
12c64503
GM
558
559
740b7c2d
EZ
560(defun rx-check-not (arg)
561 "Check arg ARG for Rx `not'."
ccfbe679 562 (unless (or (and (symbolp arg)
5dbe5c8f 563 (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
ccfbe679 564 (condition-case nil
5dbe5c8f 565 (rx-form arg)
ccfbe679 566 (error ""))))
5dbe5c8f 567 (eq arg 'word-boundary)
ccfbe679
SM
568 (and (consp arg)
569 (memq (car arg) '(not any in syntax category))))
570 (error "rx `not' syntax error: %s" arg))
571 t)
12c64503
GM
572
573
574(defun rx-not (form)
575 "Parse and produce code from FORM. FORM is `(not ...)'."
576 (rx-check form)
5dbe5c8f 577 (let ((result (rx-form (cadr form) '!))
062a9fce 578 case-fold-search)
12c64503 579 (cond ((string-match "\\`\\[^" result)
5dbe5c8f
CY
580 (cond
581 ((equal result "[^]") "[^^]")
582 ((and (= (length result) 4) (null (eq rx-parent '!)))
583 (regexp-quote (substring result 2 3)))
584 ((concat "[" (substring result 2)))))
ccfbe679 585 ((eq ?\[ (aref result 0))
12c64503 586 (concat "[^" (substring result 1)))
5dbe5c8f
CY
587 ((string-match "\\`\\\\[scbw]" result)
588 (concat (upcase (substring result 0 2))
589 (substring result 2)))
590 ((string-match "\\`\\\\[SCBW]" result)
591 (concat (downcase (substring result 0 2))
592 (substring result 2)))
12c64503
GM
593 (t
594 (concat "[^" result "]")))))
595
596
ccfbe679
SM
597(defun rx-not-char (form)
598 "Parse and produce code from FORM. FORM is `(not-char ...)'."
599 (rx-check form)
600 (rx-not `(not (in ,@(cdr form)))))
601
602
603(defun rx-not-syntax (form)
604 "Parse and produce code from FORM. FORM is `(not-syntax SYNTAX)'."
605 (rx-check form)
606 (rx-not `(not (syntax ,@(cdr form)))))
607
608
609(defun rx-trans-forms (form &optional skip)
610 "If FORM's length is greater than two, transform it to length two.
611A form (HEAD REST ...) becomes (HEAD (and REST ...)).
612If SKIP is non-nil, allow that number of items after the head, i.e.
613`(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
614 (unless skip (setq skip 0))
615 (let ((tail (nthcdr (1+ skip) form)))
616 (if (= (length tail) 1)
617 form
618 (let ((form (copy-sequence form)))
619 (setcdr (nthcdr skip form) (list (cons 'and tail)))
620 form))))
621
622
623(defun rx-= (form)
624 "Parse and produce code from FORM `(= N ...)'."
625 (rx-check form)
626 (setq form (rx-trans-forms form 1))
627 (unless (and (integerp (nth 1 form))
628 (> (nth 1 form) 0))
629 (error "rx `=' requires positive integer first arg"))
5dbe5c8f 630 (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
ccfbe679
SM
631
632
633(defun rx->= (form)
634 "Parse and produce code from FORM `(>= N ...)'."
635 (rx-check form)
636 (setq form (rx-trans-forms form 1))
637 (unless (and (integerp (nth 1 form))
638 (> (nth 1 form) 0))
639 (error "rx `>=' requires positive integer first arg"))
5dbe5c8f 640 (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
ccfbe679
SM
641
642
643(defun rx-** (form)
644 "Parse and produce code from FORM `(** N M ...)'."
645 (rx-check form)
646 (setq form (cons 'repeat (cdr (rx-trans-forms form 2))))
5dbe5c8f 647 (rx-form form '*))
ccfbe679
SM
648
649
12c64503
GM
650(defun rx-repeat (form)
651 "Parse and produce code from FORM.
652FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'."
653 (rx-check form)
654 (cond ((= (length form) 3)
655 (unless (and (integerp (nth 1 form))
656 (> (nth 1 form) 0))
740b7c2d 657 (error "rx `repeat' requires positive integer first arg"))
5dbe5c8f 658 (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
12c64503
GM
659 ((or (not (integerp (nth 2 form)))
660 (< (nth 2 form) 0)
661 (not (integerp (nth 1 form)))
662 (< (nth 1 form) 0)
663 (< (nth 2 form) (nth 1 form)))
740b7c2d 664 (error "rx `repeat' range error"))
12c64503 665 (t
5dbe5c8f 666 (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
12c64503
GM
667 (nth 1 form) (nth 2 form)))))
668
669
670(defun rx-submatch (form)
671 "Parse and produce code from FORM, which is `(submatch ...)'."
e9e9c7b8
SM
672 (concat "\\("
673 (if (= 2 (length form))
674 ;; Only one sub-form.
675 (rx-form (cadr form))
676 ;; Several sub-forms implicitly concatenated.
677 (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
678 "\\)"))
5dbe5c8f 679
12c64503 680
740b7c2d
EZ
681(defun rx-backref (form)
682 "Parse and produce code from FORM, which is `(backref N)'."
683 (rx-check form)
684 (format "\\%d" (nth 1 form)))
685
686(defun rx-check-backref (arg)
687 "Check arg ARG for Rx `backref'."
688 (or (and (integerp arg) (>= arg 1) (<= arg 9))
689 (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
690
12c64503
GM
691(defun rx-kleene (form)
692 "Parse and produce code from FORM.
693FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
a1506d29 694`zero-or-more' etc. operators.
12c64503
GM
695If OP is one of `*', `+', `?', produce a greedy regexp.
696If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
697If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
698is non-nil."
699 (rx-check form)
ccfbe679 700 (setq form (rx-trans-forms form))
5dbe5c8f 701 (let ((suffix (cond ((memq (car form) '(* + ?\s)) "")
12c64503
GM
702 ((memq (car form) '(*? +? ??)) "?")
703 (rx-greedy-flag "")
704 (t "?")))
705 (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
706 ((memq (car form) '(+ +? 1+ one-or-more)) "+")
5dbe5c8f
CY
707 (t "?"))))
708 (rx-group-if
709 (concat (rx-form (cadr form) '*) op suffix)
710 (and (memq rx-parent '(t *)) rx-parent))))
c53f9b3b 711
5dbe5c8f
CY
712
713(defun rx-atomic-p (r &optional lax)
c53f9b3b
RS
714 "Return non-nil if regexp string R is atomic.
715An atomic regexp R is one such that a suffix operator
716appended to R will apply to all of R. For example, \"a\"
717\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
718\"[ab]c\", and \"ab\\|ab*c\" are not atomic.
719
720This function may return false negatives, but it will not
721return false positives. It is nevertheless useful in
ab2d877d 722situations where an efficiency shortcut can be taken only if a
c53f9b3b
RS
723regexp is atomic. The function can be improved to detect
724more cases of atomic regexps. Presently, this function
725detects the following categories of atomic regexp;
726
727 a group or shy group: \\(...\\)
728 a character class: [...]
729 a single character: a
730
731On the other hand, false negatives will be returned for
732regexps that are atomic but end in operators, such as
733\"a+\". I think these are rare. Probably such cases could
734be detected without much effort. A guarantee of no false
735negatives would require a theoretic specification of the set
736of all atomic regexps."
737 (let ((l (length r)))
5dbe5c8f
CY
738 (cond
739 ((<= l 1))
740 ((= l 2) (= (aref r 0) ?\\))
741 ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
742 ((null lax)
743 (cond
744 ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^\]]\\)*\\]\\'" r))
745 ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^\)]\\)*\\\\)\\'" r)))))))
12c64503
GM
746
747
748(defun rx-syntax (form)
749 "Parse and produce code from FORM, which is `(syntax SYMBOL)'."
750 (rx-check form)
09c774f7
SM
751 (let* ((sym (cadr form))
752 (syntax (assq sym rx-syntax)))
12c64503 753 (unless syntax
09c774f7
SM
754 ;; Try sregex compatibility.
755 (let ((name (symbol-name sym)))
756 (if (= 1 (length name))
757 (setq syntax (rassq (aref name 0) rx-syntax))))
758 (unless syntax
759 (error "Unknown rx syntax `%s'" (cadr form))))
12c64503
GM
760 (format "\\s%c" (cdr syntax))))
761
762
763(defun rx-check-category (form)
764 "Check the argument FORM of a `(category FORM)'."
765 (unless (or (integerp form)
766 (cdr (assq form rx-categories)))
767 (error "Unknown category `%s'" form))
768 t)
a1506d29 769
12c64503
GM
770
771(defun rx-category (form)
ccfbe679 772 "Parse and produce code from FORM, which is `(category SYMBOL)'."
12c64503
GM
773 (rx-check form)
774 (let ((char (if (integerp (cadr form))
775 (cadr form)
776 (cdr (assq (cadr form) rx-categories)))))
777 (format "\\c%c" char)))
778
779
780(defun rx-eval (form)
781 "Parse and produce code from FORM, which is `(eval FORM)'."
782 (rx-check form)
5dbe5c8f 783 (rx-form (eval (cadr form)) rx-parent))
12c64503
GM
784
785
786(defun rx-greedy (form)
740b7c2d
EZ
787 "Parse and produce code from FORM.
788If FORM is '(minimal-match FORM1)', non-greedy versions of `*',
789`+', and `?' operators will be used in FORM1. If FORM is
790'(maximal-match FORM1)', greedy operators will be used."
12c64503
GM
791 (rx-check form)
792 (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
5dbe5c8f 793 (rx-form (cadr form) rx-parent)))
12c64503
GM
794
795
796(defun rx-regexp (form)
797 "Parse and produce code from FORM, which is `(regexp STRING)'."
798 (rx-check form)
5dbe5c8f
CY
799 (rx-group-if (cadr form) rx-parent))
800
801
802(defun rx-form (form &optional rx-parent)
803 "Parse and produce code for regular expression FORM.
804FORM is a regular expression in sexp form.
805RX-PARENT shows which type of expression calls and controls putting of
806shy groups around the result and some more in other functions."
807 (if (stringp form)
808 (rx-group-if (regexp-quote form)
809 (if (and (eq rx-parent '*) (< 1 (length form)))
810 rx-parent))
811 (cond ((integerp form)
812 (regexp-quote (char-to-string form)))
813 ((symbolp form)
814 (let ((info (rx-info form)))
815 (cond ((stringp info)
816 info)
817 ((null info)
818 (error "Unknown rx form `%s'" form))
819 (t
820 (funcall (nth 0 info) form)))))
821 ((consp form)
822 (let ((info (rx-info (car form))))
823 (unless (consp info)
824 (error "Unknown rx form `%s'" (car form)))
825 (funcall (nth 0 info) form)))
826 (t
827 (error "rx syntax error at `%s'" form)))))
12c64503
GM
828
829
830;;;###autoload
831(defun rx-to-string (form &optional no-group)
832 "Parse and produce code for regular expression FORM.
833FORM is a regular expression in sexp form.
834NO-GROUP non-nil means don't put shy groups around the result."
5dbe5c8f 835 (rx-group-if (rx-form form) (null no-group)))
12c64503
GM
836
837
838;;;###autoload
ccfbe679
SM
839(defmacro rx (&rest regexps)
840 "Translate regular expressions REGEXPS in sexp form to a regexp string.
841REGEXPS is a non-empty sequence of forms of the sort listed below.
baac7510
CY
842
843Note that `rx' is a Lisp macro; when used in a Lisp program being
844 compiled, the translation is performed by the compiler.
845See `rx-to-string' for how to do such a translation at run-time.
12c64503
GM
846
847The following are valid subforms of regular expressions in sexp
848notation.
849
850STRING
851 matches string STRING literally.
852
853CHAR
854 matches character CHAR literally.
855
ccfbe679 856`not-newline', `nonl'
12c64503 857 matches any character except a newline.
e8449cdb 858
12c64503
GM
859`anything'
860 matches any character
861
ccfbe679
SM
862`(any SET ...)'
863`(in SET ...)'
864`(char SET ...)'
865 matches any character in SET .... SET may be a character or string.
12c64503 866 Ranges of characters can be specified as `A-Z' in strings.
ccfbe679 867 Ranges may also be specified as conses like `(?A . ?Z)'.
12c64503 868
ccfbe679
SM
869 SET may also be the name of a character class: `digit',
870 `control', `hex-digit', `blank', `graph', `print', `alnum',
871 `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
872 `word', or one of their synonyms.
12c64503 873
ccfbe679
SM
874`(not (any SET ...))'
875 matches any character not in SET ...
12c64503 876
ccfbe679 877`line-start', `bol'
12c64503
GM
878 matches the empty string, but only at the beginning of a line
879 in the text being matched
880
ccfbe679 881`line-end', `eol'
12c64503
GM
882 is similar to `line-start' but matches only at the end of a line
883
ccfbe679 884`string-start', `bos', `bot'
12c64503
GM
885 matches the empty string, but only at the beginning of the
886 string being matched against.
887
ccfbe679 888`string-end', `eos', `eot'
12c64503
GM
889 matches the empty string, but only at the end of the
890 string being matched against.
891
892`buffer-start'
893 matches the empty string, but only at the beginning of the
ccfbe679 894 buffer being matched against. Actually equivalent to `string-start'.
12c64503
GM
895
896`buffer-end'
897 matches the empty string, but only at the end of the
ccfbe679 898 buffer being matched against. Actually equivalent to `string-end'.
12c64503
GM
899
900`point'
901 matches the empty string, but only at point.
902
ccfbe679 903`word-start', `bow'
5e3fc9eb 904 matches the empty string, but only at the beginning of a word.
12c64503 905
ccfbe679 906`word-end', `eow'
12c64503
GM
907 matches the empty string, but only at the end of a word.
908
909`word-boundary'
910 matches the empty string, but only at the beginning or end of a
911 word.
912
913`(not word-boundary)'
ccfbe679 914`not-word-boundary'
12c64503
GM
915 matches the empty string, but not at the beginning or end of a
916 word.
917
5e3fc9eb
GM
918`symbol-start'
919 matches the empty string, but only at the beginning of a symbol.
920
921`symbol-end'
922 matches the empty string, but only at the end of a symbol.
923
ccfbe679 924`digit', `numeric', `num'
12c64503
GM
925 matches 0 through 9.
926
ccfbe679 927`control', `cntrl'
12c64503
GM
928 matches ASCII control characters.
929
ccfbe679 930`hex-digit', `hex', `xdigit'
12c64503
GM
931 matches 0 through 9, a through f and A through F.
932
933`blank'
934 matches space and tab only.
935
ccfbe679 936`graphic', `graph'
12c64503
GM
937 matches graphic characters--everything except ASCII control chars,
938 space, and DEL.
939
ccfbe679 940`printing', `print'
12c64503
GM
941 matches printing characters--everything except ASCII control chars
942 and DEL.
943
ccfbe679 944`alphanumeric', `alnum'
12c64503
GM
945 matches letters and digits. (But at present, for multibyte characters,
946 it matches anything that has word syntax.)
947
ccfbe679 948`letter', `alphabetic', `alpha'
12c64503
GM
949 matches letters. (But at present, for multibyte characters,
950 it matches anything that has word syntax.)
951
952`ascii'
953 matches ASCII (unibyte) characters.
954
955`nonascii'
956 matches non-ASCII (multibyte) characters.
957
ccfbe679 958`lower', `lower-case'
12c64503
GM
959 matches anything lower-case.
960
ccfbe679 961`upper', `upper-case'
12c64503
GM
962 matches anything upper-case.
963
ccfbe679 964`punctuation', `punct'
12c64503
GM
965 matches punctuation. (But at present, for multibyte characters,
966 it matches anything that has non-word syntax.)
967
ccfbe679 968`space', `whitespace', `white'
12c64503
GM
969 matches anything that has whitespace syntax.
970
ccfbe679 971`word', `wordchar'
12c64503
GM
972 matches anything that has word syntax.
973
ccfbe679
SM
974`not-wordchar'
975 matches anything that has non-word syntax.
976
12c64503
GM
977`(syntax SYNTAX)'
978 matches a character with syntax SYNTAX. SYNTAX must be one
ccfbe679
SM
979 of the following symbols, or a symbol corresponding to the syntax
980 character, e.g. `\\.' for `\\s.'.
12c64503
GM
981
982 `whitespace' (\\s- in string notation)
983 `punctuation' (\\s.)
984 `word' (\\sw)
985 `symbol' (\\s_)
986 `open-parenthesis' (\\s()
987 `close-parenthesis' (\\s))
988 `expression-prefix' (\\s')
989 `string-quote' (\\s\")
990 `paired-delimiter' (\\s$)
991 `escape' (\\s\\)
992 `character-quote' (\\s/)
993 `comment-start' (\\s<)
994 `comment-end' (\\s>)
740b7c2d
EZ
995 `string-delimiter' (\\s|)
996 `comment-delimiter' (\\s!)
12c64503
GM
997
998`(not (syntax SYNTAX))'
ccfbe679 999 matches a character that doesn't have syntax SYNTAX.
12c64503
GM
1000
1001`(category CATEGORY)'
1002 matches a character with category CATEGORY. CATEGORY must be
1003 either a character to use for C, or one of the following symbols.
1004
1005 `consonant' (\\c0 in string notation)
1006 `base-vowel' (\\c1)
1007 `upper-diacritical-mark' (\\c2)
1008 `lower-diacritical-mark' (\\c3)
1009 `tone-mark' (\\c4)
1010 `symbol' (\\c5)
1011 `digit' (\\c6)
1012 `vowel-modifying-diacritical-mark' (\\c7)
1013 `vowel-sign' (\\c8)
1014 `semivowel-lower' (\\c9)
1015 `not-at-end-of-line' (\\c<)
1016 `not-at-beginning-of-line' (\\c>)
1017 `alpha-numeric-two-byte' (\\cA)
1018 `chinse-two-byte' (\\cC)
1019 `greek-two-byte' (\\cG)
1020 `japanese-hiragana-two-byte' (\\cH)
1021 `indian-tow-byte' (\\cI)
1022 `japanese-katakana-two-byte' (\\cK)
1023 `korean-hangul-two-byte' (\\cN)
1024 `cyrillic-two-byte' (\\cY)
ccfbe679 1025 `combining-diacritic' (\\c^)
12c64503
GM
1026 `ascii' (\\ca)
1027 `arabic' (\\cb)
1028 `chinese' (\\cc)
1029 `ethiopic' (\\ce)
1030 `greek' (\\cg)
1031 `korean' (\\ch)
1032 `indian' (\\ci)
1033 `japanese' (\\cj)
1034 `japanese-katakana' (\\ck)
1035 `latin' (\\cl)
1036 `lao' (\\co)
1037 `tibetan' (\\cq)
1038 `japanese-roman' (\\cr)
1039 `thai' (\\ct)
1040 `vietnamese' (\\cv)
1041 `hebrew' (\\cw)
1042 `cyrillic' (\\cy)
1043 `can-break' (\\c|)
1044
1045`(not (category CATEGORY))'
ccfbe679 1046 matches a character that doesn't have category CATEGORY.
12c64503
GM
1047
1048`(and SEXP1 SEXP2 ...)'
ccfbe679
SM
1049`(: SEXP1 SEXP2 ...)'
1050`(seq SEXP1 SEXP2 ...)'
1051`(sequence SEXP1 SEXP2 ...)'
12c64503
GM
1052 matches what SEXP1 matches, followed by what SEXP2 matches, etc.
1053
1054`(submatch SEXP1 SEXP2 ...)'
ccfbe679 1055`(group SEXP1 SEXP2 ...)'
12c64503
GM
1056 like `and', but makes the match accessible with `match-end',
1057 `match-beginning', and `match-string'.
1058
12c64503 1059`(or SEXP1 SEXP2 ...)'
ccfbe679 1060`(| SEXP1 SEXP2 ...)'
12c64503
GM
1061 matches anything that matches SEXP1 or SEXP2, etc. If all
1062 args are strings, use `regexp-opt' to optimize the resulting
1063 regular expression.
1064
1065`(minimal-match SEXP)'
1066 produce a non-greedy regexp for SEXP. Normally, regexps matching
740b7c2d 1067 zero or more occurrences of something are \"greedy\" in that they
12c64503
GM
1068 match as much as they can, as long as the overall regexp can
1069 still match. A non-greedy regexp matches as little as possible.
1070
1071`(maximal-match SEXP)'
0a6cac62 1072 produce a greedy regexp for SEXP. This is the default.
12c64503 1073
ccfbe679
SM
1074Below, `SEXP ...' represents a sequence of regexp forms, treated as if
1075enclosed in `(and ...)'.
12c64503 1076
ccfbe679
SM
1077`(zero-or-more SEXP ...)'
1078`(0+ SEXP ...)'
1079 matches zero or more occurrences of what SEXP ... matches.
12c64503 1080
ccfbe679
SM
1081`(* SEXP ...)'
1082 like `zero-or-more', but always produces a greedy regexp, independent
1083 of `rx-greedy-flag'.
12c64503 1084
ccfbe679
SM
1085`(*? SEXP ...)'
1086 like `zero-or-more', but always produces a non-greedy regexp,
1087 independent of `rx-greedy-flag'.
a1506d29 1088
ccfbe679
SM
1089`(one-or-more SEXP ...)'
1090`(1+ SEXP ...)'
1091 matches one or more occurrences of SEXP ...
12c64503 1092
ccfbe679 1093`(+ SEXP ...)'
12c64503
GM
1094 like `one-or-more', but always produces a greedy regexp.
1095
ccfbe679 1096`(+? SEXP ...)'
12c64503
GM
1097 like `one-or-more', but always produces a non-greedy regexp.
1098
ccfbe679
SM
1099`(zero-or-one SEXP ...)'
1100`(optional SEXP ...)'
1101`(opt SEXP ...)'
12c64503 1102 matches zero or one occurrences of A.
a1506d29 1103
ccfbe679 1104`(? SEXP ...)'
12c64503
GM
1105 like `zero-or-one', but always produces a greedy regexp.
1106
ccfbe679 1107`(?? SEXP ...)'
12c64503
GM
1108 like `zero-or-one', but always produces a non-greedy regexp.
1109
1110`(repeat N SEXP)'
ccfbe679
SM
1111`(= N SEXP ...)'
1112 matches N occurrences.
1113
1114`(>= N SEXP ...)'
1115 matches N or more occurrences.
12c64503
GM
1116
1117`(repeat N M SEXP)'
ccfbe679
SM
1118`(** N M SEXP ...)'
1119 matches N to M occurrences.
1120
942269e7
JB
1121`(backref N)'
1122 matches what was matched previously by submatch N.
1123
12c64503 1124`(eval FORM)'
942269e7
JB
1125 evaluate FORM and insert result. If result is a string,
1126 `regexp-quote' it.
12c64503
GM
1127
1128`(regexp REGEXP)'
942269e7 1129 include REGEXP in string notation in the result."
ccfbe679
SM
1130 (cond ((null regexps)
1131 (error "No regexp"))
1132 ((cdr regexps)
1133 (rx-to-string `(and ,@regexps) t))
1134 (t
1135 (rx-to-string (car regexps) t))))
1136\f
1137;; ;; sregex.el replacement
1138
1139;; ;;;###autoload (provide 'sregex)
1140;; ;;;###autoload (autoload 'sregex "rx")
1141;; (defalias 'sregex 'rx-to-string)
1142;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
1143;; (defalias 'sregexq 'rx)
1144\f
12c64503
GM
1145(provide 'rx)
1146
b62c13c2 1147;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b
12c64503 1148;;; rx.el ends here