Commit | Line | Data |
---|---|---|
12c64503 GM |
1 | ;;; rx.el --- sexp notation for regular expressions |
2 | ||
ba318903 | 3 | ;; Copyright (C) 2001-2014 Free Software Foundation, Inc. |
12c64503 GM |
4 | |
5 | ;; Author: Gerd Moellmann <gerd@gnu.org> | |
34dc21db | 6 | ;; Maintainer: emacs-devel@gnu.org |
12c64503 GM |
7 | ;; Keywords: strings, regexps, extensions |
8 | ||
9 | ;; This file is part of GNU Emacs. | |
10 | ||
d6cba7ae | 11 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
12c64503 | 12 | ;; it under the terms of the GNU General Public License as published by |
d6cba7ae GM |
13 | ;; the Free Software Foundation, either version 3 of the License, or |
14 | ;; (at your option) any later version. | |
12c64503 GM |
15 | |
16 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
17 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 | ;; GNU General Public License for more details. | |
20 | ||
21 | ;; You should have received a copy of the GNU General Public License | |
d6cba7ae | 22 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
12c64503 GM |
23 | |
24 | ;;; Commentary: | |
25 | ||
26 | ;; This is another implementation of sexp-form regular expressions. | |
27 | ;; It was unfortunately written without being aware of the Sregex | |
28 | ;; package coming with Emacs, but as things stand, Rx completely | |
29 | ;; covers all regexp features, which Sregex doesn't, doesn't suffer | |
30 | ;; from the bugs mentioned in the commentary section of Sregex, and | |
31 | ;; uses a nicer syntax (IMHO, of course :-). | |
32 | ||
ccfbe679 SM |
33 | ;; This significantly extended version of the original, is almost |
34 | ;; compatible with Sregex. The only incompatibility I (fx) know of is | |
35 | ;; that the `repeat' form can't have multiple regexp args. | |
36 | ||
37 | ;; Now alternative forms are provided for a degree of compatibility | |
f47188fe CY |
38 | ;; with Olin Shivers' attempted definitive SRE notation. SRE forms |
39 | ;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>, | |
ccfbe679 SM |
40 | ;; ,<exp>, (word ...), word+, posix-string, and character class forms. |
41 | ;; Some forms are inconsistent with SRE, either for historical reasons | |
42 | ;; or because of the implementation -- simple translation into Emacs | |
43 | ;; regexp strings. These include: any, word. Also, case-sensitivity | |
44 | ;; and greediness are controlled by variables external to the regexp, | |
45 | ;; and you need to feed the forms to the `posix-' functions to get | |
46 | ;; SRE's POSIX semantics. There are probably more difficulties. | |
47 | ||
12c64503 GM |
48 | ;; Rx translates a sexp notation for regular expressions into the |
49 | ;; usual string notation. The translation can be done at compile-time | |
50 | ;; by using the `rx' macro. It can be done at run-time by calling | |
51 | ;; function `rx-to-string'. See the documentation of `rx' for a | |
52 | ;; complete description of the sexp notation. | |
53 | ;; | |
54 | ;; Some examples of string regexps and their sexp counterparts: | |
55 | ;; | |
56 | ;; "^[a-z]*" | |
57 | ;; (rx (and line-start (0+ (in "a-z")))) | |
58 | ;; | |
59 | ;; "\n[^ \t]" | |
60 | ;; (rx (and "\n" (not blank))), or | |
61 | ;; (rx (and "\n" (not (any " \t")))) | |
62 | ;; | |
63 | ;; "\\*\\*\\* EOOH \\*\\*\\*\n" | |
64 | ;; (rx "*** EOOH ***\n") | |
65 | ;; | |
66 | ;; "\\<\\(catch\\|finally\\)\\>[^_]" | |
67 | ;; (rx (and word-start (submatch (or "catch" "finally")) word-end | |
68 | ;; (not (any ?_)))) | |
69 | ;; | |
70 | ;; "[ \t\n]*:\\([^:]+\\|$\\)" | |
71 | ;; (rx (and (zero-or-more (in " \t\n")) ":" | |
72 | ;; (submatch (or line-end (one-or-more (not (any ?:))))))) | |
73 | ;; | |
74 | ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*" | |
75 | ;; (rx (and line-start | |
76 | ;; "content-transfer-encoding:" | |
c53f9b3b | 77 | ;; (+ (? ?\n)) blank |
12c64503 | 78 | ;; "quoted-printable" |
c53f9b3b | 79 | ;; (+ (? ?\n)) blank)) |
12c64503 GM |
80 | ;; |
81 | ;; (concat "^\\(?:" something-else "\\)") | |
82 | ;; (rx (and line-start (eval something-else))), statically or | |
83 | ;; (rx-to-string '(and line-start ,something-else)), dynamically. | |
84 | ;; | |
85 | ;; (regexp-opt '(STRING1 STRING2 ...)) | |
86 | ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically | |
87 | ;; calls `regexp-opt' as needed. | |
88 | ;; | |
89 | ;; "^;;\\s-*\n\\|^\n" | |
90 | ;; (rx (or (and line-start ";;" (0+ space) ?\n) | |
91 | ;; (and line-start ?\n))) | |
92 | ;; | |
93 | ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) " | |
a1506d29 JB |
94 | ;; (rx (and "$Id: " |
95 | ;; (1+ (not (in " "))) | |
12c64503 GM |
96 | ;; " " |
97 | ;; (submatch (1+ (not (in " ")))) | |
c53f9b3b | 98 | ;; " ")) |
12c64503 GM |
99 | ;; |
100 | ;; "\\\\\\\\\\[\\w+" | |
101 | ;; (rx (and ?\\ ?\\ ?\[ (1+ word))) | |
102 | ;; | |
103 | ;; etc. | |
104 | ||
105 | ;;; History: | |
a1506d29 | 106 | ;; |
12c64503 GM |
107 | |
108 | ;;; Code: | |
109 | ||
daa9f1a6 SM |
110 | ;; FIXME: support macros. |
111 | ||
112 | (defvar rx-constituents ;Not `const' because some modes extend it. | |
12c64503 | 113 | '((and . (rx-and 1 nil)) |
ccfbe679 SM |
114 | (seq . and) ; SRE |
115 | (: . and) ; SRE | |
116 | (sequence . and) ; sregex | |
12c64503 | 117 | (or . (rx-or 1 nil)) |
ccfbe679 | 118 | (| . or) ; SRE |
12c64503 | 119 | (not-newline . ".") |
ccfbe679 | 120 | (nonl . not-newline) ; SRE |
5dbe5c8f | 121 | (anything . (rx-anything 0 nil)) |
ccfbe679 | 122 | (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE |
723ee192 | 123 | (any . ".") ; sregex |
12c64503 | 124 | (in . any) |
ccfbe679 SM |
125 | (char . any) ; sregex |
126 | (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex | |
12c64503 | 127 | (not . (rx-not 1 1 rx-check-not)) |
723ee192 | 128 | (repeat . (rx-repeat 2 nil)) |
ccfbe679 SM |
129 | (= . (rx-= 2 nil)) ; SRE |
130 | (>= . (rx->= 2 nil)) ; SRE | |
131 | (** . (rx-** 2 nil)) ; SRE | |
132 | (submatch . (rx-submatch 1 nil)) ; SRE | |
723ee192 | 133 | (group . submatch) ; sregex |
6420d28b CY |
134 | (submatch-n . (rx-submatch-n 2 nil)) |
135 | (group-n . submatch-n) | |
ccfbe679 SM |
136 | (zero-or-more . (rx-kleene 1 nil)) |
137 | (one-or-more . (rx-kleene 1 nil)) | |
138 | (zero-or-one . (rx-kleene 1 nil)) | |
139 | (\? . zero-or-one) ; SRE | |
12c64503 | 140 | (\?? . zero-or-one) |
ccfbe679 | 141 | (* . zero-or-more) ; SRE |
12c64503 GM |
142 | (*? . zero-or-more) |
143 | (0+ . zero-or-more) | |
ccfbe679 | 144 | (+ . one-or-more) ; SRE |
12c64503 GM |
145 | (+? . one-or-more) |
146 | (1+ . one-or-more) | |
147 | (optional . zero-or-one) | |
ccfbe679 | 148 | (opt . zero-or-one) ; sregex |
12c64503 GM |
149 | (minimal-match . (rx-greedy 1 1)) |
150 | (maximal-match . (rx-greedy 1 1)) | |
740b7c2d | 151 | (backref . (rx-backref 1 1 rx-check-backref)) |
12c64503 | 152 | (line-start . "^") |
ccfbe679 | 153 | (bol . line-start) ; SRE |
12c64503 | 154 | (line-end . "$") |
ccfbe679 | 155 | (eol . line-end) ; SRE |
12c64503 | 156 | (string-start . "\\`") |
ccfbe679 SM |
157 | (bos . string-start) ; SRE |
158 | (bot . string-start) ; sregex | |
12c64503 | 159 | (string-end . "\\'") |
ccfbe679 SM |
160 | (eos . string-end) ; SRE |
161 | (eot . string-end) ; sregex | |
12c64503 GM |
162 | (buffer-start . "\\`") |
163 | (buffer-end . "\\'") | |
164 | (point . "\\=") | |
165 | (word-start . "\\<") | |
ccfbe679 | 166 | (bow . word-start) ; SRE |
12c64503 | 167 | (word-end . "\\>") |
ccfbe679 | 168 | (eow . word-end) ; SRE |
12c64503 | 169 | (word-boundary . "\\b") |
ccfbe679 | 170 | (not-word-boundary . "\\B") ; sregex |
b62c13c2 SM |
171 | (symbol-start . "\\_<") |
172 | (symbol-end . "\\_>") | |
12c64503 | 173 | (syntax . (rx-syntax 1 1)) |
ccfbe679 | 174 | (not-syntax . (rx-not-syntax 1 1)) ; sregex |
12c64503 GM |
175 | (category . (rx-category 1 1 rx-check-category)) |
176 | (eval . (rx-eval 1 1)) | |
177 | (regexp . (rx-regexp 1 1 stringp)) | |
723ee192 | 178 | (regex . regexp) ; sregex |
12c64503 | 179 | (digit . "[[:digit:]]") |
ccfbe679 SM |
180 | (numeric . digit) ; SRE |
181 | (num . digit) ; SRE | |
182 | (control . "[[:cntrl:]]") ; SRE | |
183 | (cntrl . control) ; SRE | |
184 | (hex-digit . "[[:xdigit:]]") ; SRE | |
185 | (hex . hex-digit) ; SRE | |
186 | (xdigit . hex-digit) ; SRE | |
187 | (blank . "[[:blank:]]") ; SRE | |
188 | (graphic . "[[:graph:]]") ; SRE | |
189 | (graph . graphic) ; SRE | |
190 | (printing . "[[:print:]]") ; SRE | |
191 | (print . printing) ; SRE | |
192 | (alphanumeric . "[[:alnum:]]") ; SRE | |
193 | (alnum . alphanumeric) ; SRE | |
12c64503 | 194 | (letter . "[[:alpha:]]") |
ccfbe679 SM |
195 | (alphabetic . letter) ; SRE |
196 | (alpha . letter) ; SRE | |
197 | (ascii . "[[:ascii:]]") ; SRE | |
12c64503 | 198 | (nonascii . "[[:nonascii:]]") |
ccfbe679 SM |
199 | (lower . "[[:lower:]]") ; SRE |
200 | (lower-case . lower) ; SRE | |
201 | (punctuation . "[[:punct:]]") ; SRE | |
202 | (punct . punctuation) ; SRE | |
203 | (space . "[[:space:]]") ; SRE | |
204 | (whitespace . space) ; SRE | |
205 | (white . space) ; SRE | |
206 | (upper . "[[:upper:]]") ; SRE | |
207 | (upper-case . upper) ; SRE | |
208 | (word . "[[:word:]]") ; inconsistent with SRE | |
209 | (wordchar . word) ; sregex | |
5dbe5c8f | 210 | (not-wordchar . "\\W")) |
12c64503 GM |
211 | "Alist of sexp form regexp constituents. |
212 | Each element of the alist has the form (SYMBOL . DEFN). | |
213 | SYMBOL is a valid constituent of sexp regular expressions. | |
214 | If DEFN is a string, SYMBOL is translated into DEFN. | |
215 | If DEFN is a symbol, use the definition of DEFN, recursively. | |
216 | Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE). | |
217 | FUNCTION is used to produce code for SYMBOL. MIN-ARGS and MAX-ARGS | |
218 | are the minimum and maximum number of arguments the function-form | |
219 | sexp constituent SYMBOL may have in sexp regular expressions. | |
220 | MAX-ARGS nil means no limit. PREDICATE, if specified, means that | |
221 | all arguments must satisfy PREDICATE.") | |
222 | ||
223 | ||
224 | (defconst rx-syntax | |
225 | '((whitespace . ?-) | |
226 | (punctuation . ?.) | |
227 | (word . ?w) | |
228 | (symbol . ?_) | |
229 | (open-parenthesis . ?\() | |
230 | (close-parenthesis . ?\)) | |
231 | (expression-prefix . ?\') | |
232 | (string-quote . ?\") | |
233 | (paired-delimiter . ?$) | |
234 | (escape . ?\\) | |
235 | (character-quote . ?/) | |
236 | (comment-start . ?<) | |
740b7c2d EZ |
237 | (comment-end . ?>) |
238 | (string-delimiter . ?|) | |
09c774f7 | 239 | (comment-delimiter . ?!)) |
12c64503 GM |
240 | "Alist mapping Rx syntax symbols to syntax characters. |
241 | Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid | |
242 | symbol in `(syntax SYMBOL)', and CHAR is the syntax character | |
243 | corresponding to SYMBOL, as it would be used with \\s or \\S in | |
244 | regular expressions.") | |
245 | ||
246 | ||
247 | (defconst rx-categories | |
248 | '((consonant . ?0) | |
249 | (base-vowel . ?1) | |
250 | (upper-diacritical-mark . ?2) | |
251 | (lower-diacritical-mark . ?3) | |
252 | (tone-mark . ?4) | |
253 | (symbol . ?5) | |
254 | (digit . ?6) | |
255 | (vowel-modifying-diacritical-mark . ?7) | |
256 | (vowel-sign . ?8) | |
257 | (semivowel-lower . ?9) | |
258 | (not-at-end-of-line . ?<) | |
259 | (not-at-beginning-of-line . ?>) | |
260 | (alpha-numeric-two-byte . ?A) | |
73dbf960 PE |
261 | (chinese-two-byte . ?C) |
262 | (chinse-two-byte . ?C) ;; A typo in Emacs 21.1-24.3. | |
12c64503 GM |
263 | (greek-two-byte . ?G) |
264 | (japanese-hiragana-two-byte . ?H) | |
265 | (indian-two-byte . ?I) | |
266 | (japanese-katakana-two-byte . ?K) | |
267 | (korean-hangul-two-byte . ?N) | |
268 | (cyrillic-two-byte . ?Y) | |
740b7c2d | 269 | (combining-diacritic . ?^) |
12c64503 GM |
270 | (ascii . ?a) |
271 | (arabic . ?b) | |
272 | (chinese . ?c) | |
273 | (ethiopic . ?e) | |
274 | (greek . ?g) | |
275 | (korean . ?h) | |
276 | (indian . ?i) | |
277 | (japanese . ?j) | |
278 | (japanese-katakana . ?k) | |
279 | (latin . ?l) | |
280 | (lao . ?o) | |
281 | (tibetan . ?q) | |
282 | (japanese-roman . ?r) | |
283 | (thai . ?t) | |
284 | (vietnamese . ?v) | |
285 | (hebrew . ?w) | |
286 | (cyrillic . ?y) | |
287 | (can-break . ?|)) | |
288 | "Alist mapping symbols to category characters. | |
289 | Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid | |
290 | symbol in `(category SYMBOL)', and CHAR is the category character | |
291 | corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in | |
292 | regular expression strings.") | |
293 | ||
294 | ||
295 | (defvar rx-greedy-flag t | |
296 | "Non-nil means produce greedy regular expressions for `zero-or-one', | |
297 | `zero-or-more', and `one-or-more'. Dynamically bound.") | |
298 | ||
299 | ||
723ee192 | 300 | (defun rx-info (op head) |
12c64503 GM |
301 | "Return parsing/code generation info for OP. |
302 | If OP is the space character ASCII 32, return info for the symbol `?'. | |
303 | If OP is the character `?', return info for the symbol `??'. | |
723ee192 SM |
304 | See also `rx-constituents'. |
305 | If HEAD is non-nil, then OP is the head of a sexp, otherwise it's | |
306 | a standalone symbol." | |
12c64503 GM |
307 | (cond ((eq op ? ) (setq op '\?)) |
308 | ((eq op ??) (setq op '\??))) | |
723ee192 SM |
309 | (let (old-op) |
310 | (while (and (not (null op)) (symbolp op)) | |
311 | (setq old-op op) | |
312 | (setq op (cdr (assq op rx-constituents))) | |
313 | (when (if head (stringp op) (consp op)) | |
314 | ;; We found something but of the wrong kind. Let's look for an | |
315 | ;; alternate definition for the other case. | |
316 | (let ((new-op | |
317 | (cdr (assq old-op (cdr (memq (assq old-op rx-constituents) | |
318 | rx-constituents)))))) | |
319 | (if (and new-op (not (if head (stringp new-op) (consp new-op)))) | |
320 | (setq op new-op)))))) | |
12c64503 | 321 | op) |
a1506d29 | 322 | |
12c64503 GM |
323 | |
324 | (defun rx-check (form) | |
325 | "Check FORM according to its car's parsing info." | |
ccfbe679 SM |
326 | (unless (listp form) |
327 | (error "rx `%s' needs argument(s)" form)) | |
723ee192 | 328 | (let* ((rx (rx-info (car form) 'head)) |
12c64503 GM |
329 | (nargs (1- (length form))) |
330 | (min-args (nth 1 rx)) | |
331 | (max-args (nth 2 rx)) | |
332 | (type-pred (nth 3 rx))) | |
333 | (when (and (not (null min-args)) | |
334 | (< nargs min-args)) | |
740b7c2d | 335 | (error "rx form `%s' requires at least %d args" |
12c64503 GM |
336 | (car form) min-args)) |
337 | (when (and (not (null max-args)) | |
338 | (> nargs max-args)) | |
740b7c2d | 339 | (error "rx form `%s' accepts at most %d args" |
12c64503 GM |
340 | (car form) max-args)) |
341 | (when (not (null type-pred)) | |
342 | (dolist (sub-form (cdr form)) | |
343 | (unless (funcall type-pred sub-form) | |
740b7c2d | 344 | (error "rx form `%s' requires args satisfying `%s'" |
12c64503 GM |
345 | (car form) type-pred)))))) |
346 | ||
347 | ||
5dbe5c8f CY |
348 | (defun rx-group-if (regexp group) |
349 | "Put shy groups around REGEXP if seemingly necessary when GROUP | |
350 | is non-nil." | |
351 | (cond | |
352 | ;; for some repetition | |
353 | ((eq group '*) (if (rx-atomic-p regexp) (setq group nil))) | |
354 | ;; for concatenation | |
355 | ((eq group ':) | |
356 | (if (rx-atomic-p | |
357 | (if (string-match | |
358 | "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp) | |
359 | (substring regexp 0 (match-beginning 0)) | |
360 | regexp)) | |
361 | (setq group nil))) | |
362 | ;; for OR | |
363 | ((eq group '|) (setq group nil)) | |
364 | ;; do anyway | |
365 | ((eq group t)) | |
366 | ((rx-atomic-p regexp t) (setq group nil))) | |
367 | (if group | |
368 | (concat "\\(?:" regexp "\\)") | |
369 | regexp)) | |
370 | ||
371 | ||
372 | (defvar rx-parent) | |
373 | ;; dynamically bound in some functions. | |
374 | ||
375 | ||
12c64503 GM |
376 | (defun rx-and (form) |
377 | "Parse and produce code from FORM. | |
378 | FORM is of the form `(and FORM1 ...)'." | |
379 | (rx-check form) | |
5dbe5c8f CY |
380 | (rx-group-if |
381 | (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil) | |
382 | (and (memq rx-parent '(* t)) rx-parent))) | |
12c64503 GM |
383 | |
384 | ||
385 | (defun rx-or (form) | |
386 | "Parse and produce code from FORM, which is `(or FORM1 ...)'." | |
387 | (rx-check form) | |
5dbe5c8f CY |
388 | (rx-group-if |
389 | (if (memq nil (mapcar 'stringp (cdr form))) | |
390 | (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|") | |
391 | (regexp-opt (cdr form))) | |
392 | (and (memq rx-parent '(: * t)) rx-parent))) | |
393 | ||
394 | ||
395 | (defun rx-anything (form) | |
396 | "Match any character." | |
397 | (if (consp form) | |
91af3942 | 398 | (error "rx `anything' syntax error: %s" form)) |
5dbe5c8f CY |
399 | (rx-or (list 'or 'not-newline ?\n))) |
400 | ||
401 | ||
402 | (defun rx-any-delete-from-range (char ranges) | |
403 | "Delete by side effect character CHAR from RANGES. | |
404 | Only both edges of each range is checked." | |
405 | (let (m) | |
406 | (cond | |
407 | ((memq char ranges) (setq ranges (delq char ranges))) | |
408 | ((setq m (assq char ranges)) | |
409 | (if (eq (1+ char) (cdr m)) | |
410 | (setcar (memq m ranges) (1+ char)) | |
411 | (setcar m (1+ char)))) | |
412 | ((setq m (rassq char ranges)) | |
413 | (if (eq (1- char) (car m)) | |
414 | (setcar (memq m ranges) (1- char)) | |
415 | (setcdr m (1- char))))) | |
416 | ranges)) | |
417 | ||
1f6f7722 | 418 | |
5dbe5c8f CY |
419 | (defun rx-any-condense-range (args) |
420 | "Condense by side effect ARGS as range for Rx `any'." | |
421 | (let (str | |
422 | l) | |
423 | ;; set STR list of all strings | |
424 | ;; set L list of all ranges | |
425 | (mapc (lambda (e) (cond ((stringp e) (push e str)) | |
426 | ((numberp e) (push (cons e e) l)) | |
427 | (t (push e l)))) | |
428 | args) | |
429 | ;; condense overlapped ranges in L | |
430 | (let ((tail (setq l (sort l #'car-less-than-car))) | |
431 | d) | |
432 | (while (setq d (cdr tail)) | |
433 | (if (>= (cdar tail) (1- (caar d))) | |
434 | (progn | |
435 | (setcdr (car tail) (max (cdar tail) (cdar d))) | |
436 | (setcdr tail (cdr d))) | |
437 | (setq tail d)))) | |
438 | ;; Separate small ranges to single number, and delete dups. | |
439 | (nconc | |
440 | (apply #'nconc | |
441 | (mapcar (lambda (e) | |
442 | (cond | |
443 | ((= (car e) (cdr e)) (list (car e))) | |
19fe13df | 444 | ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e))) |
5dbe5c8f CY |
445 | ((list e)))) |
446 | l)) | |
447 | (delete-dups str)))) | |
448 | ||
449 | ||
450 | (defun rx-check-any-string (str) | |
451 | "Check string argument STR for Rx `any'." | |
452 | (let ((i 0) | |
453 | c1 c2 l) | |
454 | (if (= 0 (length str)) | |
455 | (error "String arg for Rx `any' must not be empty")) | |
456 | (while (string-match ".-." str i) | |
457 | ;; string before range: convert it to characters | |
458 | (if (< i (match-beginning 0)) | |
459 | (setq l (nconc | |
460 | l | |
461 | (append (substring str i (match-beginning 0)) nil)))) | |
462 | ;; range | |
463 | (setq i (match-end 0) | |
464 | c1 (aref str (match-beginning 0)) | |
465 | c2 (aref str (1- i))) | |
466 | (cond | |
467 | ((< c1 c2) (setq l (nconc l (list (cons c1 c2))))) | |
468 | ((= c1 c2) (setq l (nconc l (list c1)))))) | |
469 | ;; rest? | |
470 | (if (< i (length str)) | |
471 | (setq l (nconc l (append (substring str i) nil)))) | |
472 | l)) | |
12c64503 | 473 | |
12c64503 GM |
474 | |
475 | (defun rx-check-any (arg) | |
476 | "Check arg ARG for Rx `any'." | |
5dbe5c8f CY |
477 | (cond |
478 | ((integerp arg) (list arg)) | |
479 | ((symbolp arg) | |
ccfbe679 | 480 | (let ((translation (condition-case nil |
5dbe5c8f | 481 | (rx-form arg) |
ccfbe679 | 482 | (error nil)))) |
5dbe5c8f CY |
483 | (if (or (null translation) |
484 | (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation))) | |
485 | (error "Invalid char class `%s' in Rx `any'" arg)) | |
486 | (list (substring translation 1 -1)))) ; strip outer brackets | |
487 | ((and (integerp (car-safe arg)) (integerp (cdr-safe arg))) | |
488 | (list arg)) | |
489 | ((stringp arg) (rx-check-any-string arg)) | |
490 | ((error | |
491 | "rx `any' requires string, character, char pair or char class args")))) | |
492 | ||
12c64503 GM |
493 | |
494 | (defun rx-any (form) | |
ccfbe679 SM |
495 | "Parse and produce code from FORM, which is `(any ARG ...)'. |
496 | ARG is optional." | |
12c64503 | 497 | (rx-check form) |
5dbe5c8f CY |
498 | (let* ((args (rx-any-condense-range |
499 | (apply | |
500 | #'nconc | |
501 | (mapcar #'rx-check-any (cdr form))))) | |
502 | m | |
503 | s) | |
504 | (cond | |
505 | ;; single close bracket | |
506 | ;; => "[]...-]" or "[]...--.]" | |
507 | ((memq ?\] args) | |
508 | ;; set ] at the beginning | |
509 | (setq args (cons ?\] (delq ?\] args))) | |
510 | ;; set - at the end | |
511 | (if (or (memq ?- args) (assq ?- args)) | |
512 | (setq args (nconc (rx-any-delete-from-range ?- args) | |
513 | (list ?-))))) | |
514 | ;; close bracket starts a range | |
515 | ;; => "[]-....-]" or "[]-.--....]" | |
516 | ((setq m (assq ?\] args)) | |
517 | ;; bring it to the beginning | |
518 | (setq args (cons m (delq m args))) | |
519 | (cond ((memq ?- args) | |
520 | ;; to the end | |
521 | (setq args (nconc (delq ?- args) (list ?-)))) | |
522 | ((setq m (assq ?- args)) | |
523 | ;; next to the bracket's range, make the second range | |
524 | (setcdr args (cons m (delq m args)))))) | |
525 | ;; bracket in the end range | |
526 | ;; => "[]...-]" | |
527 | ((setq m (rassq ?\] args)) | |
528 | ;; set ] at the beginning | |
529 | (setq args (cons ?\] (rx-any-delete-from-range ?\] args))) | |
530 | ;; set - at the end | |
531 | (if (or (memq ?- args) (assq ?- args)) | |
532 | (setq args (nconc (rx-any-delete-from-range ?- args) | |
533 | (list ?-))))) | |
534 | ;; {no close bracket appears} | |
535 | ;; | |
536 | ;; bring single bar to the beginning | |
537 | ((memq ?- args) | |
538 | (setq args (cons ?- (delq ?- args)))) | |
539 | ;; bar start a range, bring it to the beginning | |
540 | ((setq m (assq ?- args)) | |
541 | (setq args (cons m (delq m args)))) | |
542 | ;; | |
543 | ;; hat at the beginning? | |
544 | ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^)) | |
545 | (setq args (if (cdr args) | |
546 | `(,(cadr args) ,(car args) ,@(cddr args)) | |
547 | (nconc (rx-any-delete-from-range ?^ args) | |
548 | (list ?^)))))) | |
549 | ;; some 1-char? | |
550 | (if (and (null (cdr args)) (numberp (car args)) | |
551 | (or (= 1 (length | |
552 | (setq s (regexp-quote (string (car args)))))) | |
553 | (and (equal (car args) ?^) ;; unnecessary predicate? | |
554 | (null (eq rx-parent '!))))) | |
555 | s | |
556 | (concat "[" | |
557 | (mapconcat | |
558 | (lambda (e) (cond | |
559 | ((numberp e) (string e)) | |
560 | ((consp e) | |
561 | (if (and (= (1+ (car e)) (cdr e)) | |
19fe13df SM |
562 | ;; rx-any-condense-range should |
563 | ;; prevent this case from happening. | |
564 | (null (memq (car e) '(?\] ?-))) | |
565 | (null (memq (cdr e) '(?\] ?-)))) | |
5dbe5c8f CY |
566 | (string (car e) (cdr e)) |
567 | (string (car e) ?- (cdr e)))) | |
568 | (e))) | |
569 | args | |
570 | nil) | |
571 | "]")))) | |
12c64503 GM |
572 | |
573 | ||
740b7c2d EZ |
574 | (defun rx-check-not (arg) |
575 | "Check arg ARG for Rx `not'." | |
ccfbe679 | 576 | (unless (or (and (symbolp arg) |
5dbe5c8f | 577 | (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" |
ccfbe679 | 578 | (condition-case nil |
5dbe5c8f | 579 | (rx-form arg) |
ccfbe679 | 580 | (error "")))) |
1f6f7722 | 581 | (eq arg 'word-boundary) |
ccfbe679 SM |
582 | (and (consp arg) |
583 | (memq (car arg) '(not any in syntax category)))) | |
584 | (error "rx `not' syntax error: %s" arg)) | |
585 | t) | |
12c64503 GM |
586 | |
587 | ||
588 | (defun rx-not (form) | |
589 | "Parse and produce code from FORM. FORM is `(not ...)'." | |
590 | (rx-check form) | |
5dbe5c8f | 591 | (let ((result (rx-form (cadr form) '!)) |
062a9fce | 592 | case-fold-search) |
12c64503 | 593 | (cond ((string-match "\\`\\[^" result) |
5dbe5c8f CY |
594 | (cond |
595 | ((equal result "[^]") "[^^]") | |
596 | ((and (= (length result) 4) (null (eq rx-parent '!))) | |
597 | (regexp-quote (substring result 2 3))) | |
598 | ((concat "[" (substring result 2))))) | |
ccfbe679 | 599 | ((eq ?\[ (aref result 0)) |
12c64503 | 600 | (concat "[^" (substring result 1))) |
5dbe5c8f CY |
601 | ((string-match "\\`\\\\[scbw]" result) |
602 | (concat (upcase (substring result 0 2)) | |
603 | (substring result 2))) | |
604 | ((string-match "\\`\\\\[SCBW]" result) | |
605 | (concat (downcase (substring result 0 2)) | |
606 | (substring result 2))) | |
12c64503 GM |
607 | (t |
608 | (concat "[^" result "]"))))) | |
609 | ||
610 | ||
ccfbe679 SM |
611 | (defun rx-not-char (form) |
612 | "Parse and produce code from FORM. FORM is `(not-char ...)'." | |
613 | (rx-check form) | |
614 | (rx-not `(not (in ,@(cdr form))))) | |
615 | ||
616 | ||
617 | (defun rx-not-syntax (form) | |
618 | "Parse and produce code from FORM. FORM is `(not-syntax SYNTAX)'." | |
619 | (rx-check form) | |
620 | (rx-not `(not (syntax ,@(cdr form))))) | |
621 | ||
622 | ||
623 | (defun rx-trans-forms (form &optional skip) | |
624 | "If FORM's length is greater than two, transform it to length two. | |
625 | A form (HEAD REST ...) becomes (HEAD (and REST ...)). | |
626 | If SKIP is non-nil, allow that number of items after the head, i.e. | |
627 | `(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1." | |
628 | (unless skip (setq skip 0)) | |
629 | (let ((tail (nthcdr (1+ skip) form))) | |
630 | (if (= (length tail) 1) | |
631 | form | |
632 | (let ((form (copy-sequence form))) | |
633 | (setcdr (nthcdr skip form) (list (cons 'and tail))) | |
634 | form)))) | |
635 | ||
636 | ||
637 | (defun rx-= (form) | |
638 | "Parse and produce code from FORM `(= N ...)'." | |
639 | (rx-check form) | |
640 | (setq form (rx-trans-forms form 1)) | |
641 | (unless (and (integerp (nth 1 form)) | |
642 | (> (nth 1 form) 0)) | |
643 | (error "rx `=' requires positive integer first arg")) | |
5dbe5c8f | 644 | (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form))) |
ccfbe679 SM |
645 | |
646 | ||
647 | (defun rx->= (form) | |
648 | "Parse and produce code from FORM `(>= N ...)'." | |
649 | (rx-check form) | |
650 | (setq form (rx-trans-forms form 1)) | |
651 | (unless (and (integerp (nth 1 form)) | |
652 | (> (nth 1 form) 0)) | |
653 | (error "rx `>=' requires positive integer first arg")) | |
5dbe5c8f | 654 | (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form))) |
ccfbe679 SM |
655 | |
656 | ||
657 | (defun rx-** (form) | |
658 | "Parse and produce code from FORM `(** N M ...)'." | |
659 | (rx-check form) | |
723ee192 | 660 | (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*)) |
ccfbe679 SM |
661 | |
662 | ||
12c64503 GM |
663 | (defun rx-repeat (form) |
664 | "Parse and produce code from FORM. | |
723ee192 | 665 | FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'." |
12c64503 | 666 | (rx-check form) |
723ee192 SM |
667 | (if (> (length form) 4) |
668 | (setq form (rx-trans-forms form 2))) | |
669 | (if (null (nth 2 form)) | |
1f6f7722 | 670 | (setq form (cons (nth 0 form) (cons (nth 1 form) (nthcdr 3 form))))) |
12c64503 GM |
671 | (cond ((= (length form) 3) |
672 | (unless (and (integerp (nth 1 form)) | |
673 | (> (nth 1 form) 0)) | |
740b7c2d | 674 | (error "rx `repeat' requires positive integer first arg")) |
5dbe5c8f | 675 | (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form))) |
12c64503 GM |
676 | ((or (not (integerp (nth 2 form))) |
677 | (< (nth 2 form) 0) | |
678 | (not (integerp (nth 1 form))) | |
679 | (< (nth 1 form) 0) | |
680 | (< (nth 2 form) (nth 1 form))) | |
740b7c2d | 681 | (error "rx `repeat' range error")) |
12c64503 | 682 | (t |
5dbe5c8f | 683 | (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*) |
12c64503 GM |
684 | (nth 1 form) (nth 2 form))))) |
685 | ||
686 | ||
687 | (defun rx-submatch (form) | |
688 | "Parse and produce code from FORM, which is `(submatch ...)'." | |
e9e9c7b8 SM |
689 | (concat "\\(" |
690 | (if (= 2 (length form)) | |
691 | ;; Only one sub-form. | |
692 | (rx-form (cadr form)) | |
693 | ;; Several sub-forms implicitly concatenated. | |
694 | (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil)) | |
695 | "\\)")) | |
5dbe5c8f | 696 | |
6420d28b CY |
697 | (defun rx-submatch-n (form) |
698 | "Parse and produce code from FORM, which is `(submatch-n N ...)'." | |
699 | (let ((n (nth 1 form))) | |
700 | (concat "\\(?" (number-to-string n) ":" | |
701 | (if (= 3 (length form)) | |
702 | ;; Only one sub-form. | |
703 | (rx-form (nth 2 form)) | |
704 | ;; Several sub-forms implicitly concatenated. | |
705 | (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil)) | |
706 | "\\)"))) | |
12c64503 | 707 | |
740b7c2d EZ |
708 | (defun rx-backref (form) |
709 | "Parse and produce code from FORM, which is `(backref N)'." | |
710 | (rx-check form) | |
711 | (format "\\%d" (nth 1 form))) | |
712 | ||
713 | (defun rx-check-backref (arg) | |
714 | "Check arg ARG for Rx `backref'." | |
715 | (or (and (integerp arg) (>= arg 1) (<= arg 9)) | |
716 | (error "rx `backref' requires numeric 1<=arg<=9: %s" arg))) | |
717 | ||
12c64503 GM |
718 | (defun rx-kleene (form) |
719 | "Parse and produce code from FORM. | |
720 | FORM is `(OP FORM1)', where OP is one of the `zero-or-one', | |
a1506d29 | 721 | `zero-or-more' etc. operators. |
12c64503 GM |
722 | If OP is one of `*', `+', `?', produce a greedy regexp. |
723 | If OP is one of `*?', `+?', `??', produce a non-greedy regexp. | |
724 | If OP is anything else, produce a greedy regexp if `rx-greedy-flag' | |
725 | is non-nil." | |
726 | (rx-check form) | |
ccfbe679 | 727 | (setq form (rx-trans-forms form)) |
5dbe5c8f | 728 | (let ((suffix (cond ((memq (car form) '(* + ?\s)) "") |
12c64503 GM |
729 | ((memq (car form) '(*? +? ??)) "?") |
730 | (rx-greedy-flag "") | |
731 | (t "?"))) | |
732 | (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*") | |
733 | ((memq (car form) '(+ +? 1+ one-or-more)) "+") | |
5dbe5c8f CY |
734 | (t "?")))) |
735 | (rx-group-if | |
736 | (concat (rx-form (cadr form) '*) op suffix) | |
737 | (and (memq rx-parent '(t *)) rx-parent)))) | |
c53f9b3b | 738 | |
5dbe5c8f CY |
739 | |
740 | (defun rx-atomic-p (r &optional lax) | |
c53f9b3b RS |
741 | "Return non-nil if regexp string R is atomic. |
742 | An atomic regexp R is one such that a suffix operator | |
743 | appended to R will apply to all of R. For example, \"a\" | |
744 | \"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\", | |
745 | \"[ab]c\", and \"ab\\|ab*c\" are not atomic. | |
746 | ||
747 | This function may return false negatives, but it will not | |
748 | return false positives. It is nevertheless useful in | |
ab2d877d | 749 | situations where an efficiency shortcut can be taken only if a |
c53f9b3b RS |
750 | regexp is atomic. The function can be improved to detect |
751 | more cases of atomic regexps. Presently, this function | |
752 | detects the following categories of atomic regexp; | |
753 | ||
754 | a group or shy group: \\(...\\) | |
755 | a character class: [...] | |
756 | a single character: a | |
757 | ||
758 | On the other hand, false negatives will be returned for | |
759 | regexps that are atomic but end in operators, such as | |
760 | \"a+\". I think these are rare. Probably such cases could | |
761 | be detected without much effort. A guarantee of no false | |
762 | negatives would require a theoretic specification of the set | |
763 | of all atomic regexps." | |
764 | (let ((l (length r))) | |
5dbe5c8f CY |
765 | (cond |
766 | ((<= l 1)) | |
767 | ((= l 2) (= (aref r 0) ?\\)) | |
768 | ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r)) | |
769 | ((null lax) | |
770 | (cond | |
771 | ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^\]]\\)*\\]\\'" r)) | |
772 | ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^\)]\\)*\\\\)\\'" r))))))) | |
12c64503 GM |
773 | |
774 | ||
775 | (defun rx-syntax (form) | |
776 | "Parse and produce code from FORM, which is `(syntax SYMBOL)'." | |
777 | (rx-check form) | |
09c774f7 | 778 | (let* ((sym (cadr form)) |
723ee192 | 779 | (syntax (cdr (assq sym rx-syntax)))) |
12c64503 | 780 | (unless syntax |
09c774f7 | 781 | ;; Try sregex compatibility. |
723ee192 | 782 | (cond |
8ee2dc5c | 783 | ((characterp sym) (setq syntax sym)) |
723ee192 SM |
784 | ((symbolp sym) |
785 | (let ((name (symbol-name sym))) | |
786 | (if (= 1 (length name)) | |
787 | (setq syntax (aref name 0)))))) | |
09c774f7 | 788 | (unless syntax |
723ee192 SM |
789 | (error "Unknown rx syntax `%s'" sym))) |
790 | (format "\\s%c" syntax))) | |
12c64503 GM |
791 | |
792 | ||
793 | (defun rx-check-category (form) | |
794 | "Check the argument FORM of a `(category FORM)'." | |
795 | (unless (or (integerp form) | |
796 | (cdr (assq form rx-categories))) | |
797 | (error "Unknown category `%s'" form)) | |
798 | t) | |
a1506d29 | 799 | |
12c64503 GM |
800 | |
801 | (defun rx-category (form) | |
ccfbe679 | 802 | "Parse and produce code from FORM, which is `(category SYMBOL)'." |
12c64503 GM |
803 | (rx-check form) |
804 | (let ((char (if (integerp (cadr form)) | |
805 | (cadr form) | |
806 | (cdr (assq (cadr form) rx-categories))))) | |
807 | (format "\\c%c" char))) | |
808 | ||
809 | ||
810 | (defun rx-eval (form) | |
811 | "Parse and produce code from FORM, which is `(eval FORM)'." | |
812 | (rx-check form) | |
5dbe5c8f | 813 | (rx-form (eval (cadr form)) rx-parent)) |
12c64503 GM |
814 | |
815 | ||
816 | (defun rx-greedy (form) | |
740b7c2d EZ |
817 | "Parse and produce code from FORM. |
818 | If FORM is '(minimal-match FORM1)', non-greedy versions of `*', | |
819 | `+', and `?' operators will be used in FORM1. If FORM is | |
820 | '(maximal-match FORM1)', greedy operators will be used." | |
12c64503 GM |
821 | (rx-check form) |
822 | (let ((rx-greedy-flag (eq (car form) 'maximal-match))) | |
5dbe5c8f | 823 | (rx-form (cadr form) rx-parent))) |
12c64503 GM |
824 | |
825 | ||
826 | (defun rx-regexp (form) | |
827 | "Parse and produce code from FORM, which is `(regexp STRING)'." | |
828 | (rx-check form) | |
5dbe5c8f CY |
829 | (rx-group-if (cadr form) rx-parent)) |
830 | ||
831 | ||
832 | (defun rx-form (form &optional rx-parent) | |
833 | "Parse and produce code for regular expression FORM. | |
834 | FORM is a regular expression in sexp form. | |
835 | RX-PARENT shows which type of expression calls and controls putting of | |
836 | shy groups around the result and some more in other functions." | |
daa9f1a6 SM |
837 | (cond |
838 | ((stringp form) | |
839 | (rx-group-if (regexp-quote form) | |
840 | (if (and (eq rx-parent '*) (< 1 (length form))) | |
841 | rx-parent))) | |
842 | ((integerp form) | |
843 | (regexp-quote (char-to-string form))) | |
844 | ((symbolp form) | |
845 | (let ((info (rx-info form nil))) | |
846 | (cond ((stringp info) | |
847 | info) | |
848 | ((null info) | |
849 | (error "Unknown rx form `%s'" form)) | |
850 | (t | |
851 | (funcall (nth 0 info) form))))) | |
852 | ((consp form) | |
853 | (let ((info (rx-info (car form) 'head))) | |
854 | (unless (consp info) | |
855 | (error "Unknown rx form `%s'" (car form))) | |
856 | (funcall (nth 0 info) form))) | |
857 | (t | |
858 | (error "rx syntax error at `%s'" form)))) | |
12c64503 GM |
859 | |
860 | ||
861 | ;;;###autoload | |
862 | (defun rx-to-string (form &optional no-group) | |
863 | "Parse and produce code for regular expression FORM. | |
864 | FORM is a regular expression in sexp form. | |
865 | NO-GROUP non-nil means don't put shy groups around the result." | |
5dbe5c8f | 866 | (rx-group-if (rx-form form) (null no-group))) |
12c64503 GM |
867 | |
868 | ||
869 | ;;;###autoload | |
ccfbe679 SM |
870 | (defmacro rx (&rest regexps) |
871 | "Translate regular expressions REGEXPS in sexp form to a regexp string. | |
872 | REGEXPS is a non-empty sequence of forms of the sort listed below. | |
baac7510 CY |
873 | |
874 | Note that `rx' is a Lisp macro; when used in a Lisp program being | |
1139b0be | 875 | compiled, the translation is performed by the compiler. |
baac7510 | 876 | See `rx-to-string' for how to do such a translation at run-time. |
12c64503 GM |
877 | |
878 | The following are valid subforms of regular expressions in sexp | |
879 | notation. | |
880 | ||
881 | STRING | |
882 | matches string STRING literally. | |
883 | ||
884 | CHAR | |
885 | matches character CHAR literally. | |
886 | ||
ccfbe679 | 887 | `not-newline', `nonl' |
12c64503 | 888 | matches any character except a newline. |
e8449cdb | 889 | |
12c64503 GM |
890 | `anything' |
891 | matches any character | |
892 | ||
ccfbe679 SM |
893 | `(any SET ...)' |
894 | `(in SET ...)' | |
895 | `(char SET ...)' | |
896 | matches any character in SET .... SET may be a character or string. | |
12c64503 | 897 | Ranges of characters can be specified as `A-Z' in strings. |
ccfbe679 | 898 | Ranges may also be specified as conses like `(?A . ?Z)'. |
12c64503 | 899 | |
ccfbe679 SM |
900 | SET may also be the name of a character class: `digit', |
901 | `control', `hex-digit', `blank', `graph', `print', `alnum', | |
902 | `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper', | |
903 | `word', or one of their synonyms. | |
12c64503 | 904 | |
ccfbe679 SM |
905 | `(not (any SET ...))' |
906 | matches any character not in SET ... | |
12c64503 | 907 | |
ccfbe679 | 908 | `line-start', `bol' |
12c64503 GM |
909 | matches the empty string, but only at the beginning of a line |
910 | in the text being matched | |
911 | ||
ccfbe679 | 912 | `line-end', `eol' |
12c64503 GM |
913 | is similar to `line-start' but matches only at the end of a line |
914 | ||
ccfbe679 | 915 | `string-start', `bos', `bot' |
12c64503 GM |
916 | matches the empty string, but only at the beginning of the |
917 | string being matched against. | |
918 | ||
ccfbe679 | 919 | `string-end', `eos', `eot' |
12c64503 GM |
920 | matches the empty string, but only at the end of the |
921 | string being matched against. | |
922 | ||
923 | `buffer-start' | |
924 | matches the empty string, but only at the beginning of the | |
ccfbe679 | 925 | buffer being matched against. Actually equivalent to `string-start'. |
12c64503 GM |
926 | |
927 | `buffer-end' | |
928 | matches the empty string, but only at the end of the | |
ccfbe679 | 929 | buffer being matched against. Actually equivalent to `string-end'. |
12c64503 GM |
930 | |
931 | `point' | |
932 | matches the empty string, but only at point. | |
933 | ||
ccfbe679 | 934 | `word-start', `bow' |
5e3fc9eb | 935 | matches the empty string, but only at the beginning of a word. |
12c64503 | 936 | |
ccfbe679 | 937 | `word-end', `eow' |
12c64503 GM |
938 | matches the empty string, but only at the end of a word. |
939 | ||
940 | `word-boundary' | |
941 | matches the empty string, but only at the beginning or end of a | |
942 | word. | |
943 | ||
944 | `(not word-boundary)' | |
ccfbe679 | 945 | `not-word-boundary' |
12c64503 GM |
946 | matches the empty string, but not at the beginning or end of a |
947 | word. | |
948 | ||
5e3fc9eb GM |
949 | `symbol-start' |
950 | matches the empty string, but only at the beginning of a symbol. | |
951 | ||
952 | `symbol-end' | |
953 | matches the empty string, but only at the end of a symbol. | |
954 | ||
ccfbe679 | 955 | `digit', `numeric', `num' |
12c64503 GM |
956 | matches 0 through 9. |
957 | ||
ccfbe679 | 958 | `control', `cntrl' |
12c64503 GM |
959 | matches ASCII control characters. |
960 | ||
ccfbe679 | 961 | `hex-digit', `hex', `xdigit' |
12c64503 GM |
962 | matches 0 through 9, a through f and A through F. |
963 | ||
964 | `blank' | |
965 | matches space and tab only. | |
966 | ||
ccfbe679 | 967 | `graphic', `graph' |
12c64503 GM |
968 | matches graphic characters--everything except ASCII control chars, |
969 | space, and DEL. | |
970 | ||
ccfbe679 | 971 | `printing', `print' |
12c64503 GM |
972 | matches printing characters--everything except ASCII control chars |
973 | and DEL. | |
974 | ||
ccfbe679 | 975 | `alphanumeric', `alnum' |
12c64503 GM |
976 | matches letters and digits. (But at present, for multibyte characters, |
977 | it matches anything that has word syntax.) | |
978 | ||
ccfbe679 | 979 | `letter', `alphabetic', `alpha' |
12c64503 GM |
980 | matches letters. (But at present, for multibyte characters, |
981 | it matches anything that has word syntax.) | |
982 | ||
983 | `ascii' | |
984 | matches ASCII (unibyte) characters. | |
985 | ||
986 | `nonascii' | |
987 | matches non-ASCII (multibyte) characters. | |
988 | ||
ccfbe679 | 989 | `lower', `lower-case' |
12c64503 GM |
990 | matches anything lower-case. |
991 | ||
ccfbe679 | 992 | `upper', `upper-case' |
12c64503 GM |
993 | matches anything upper-case. |
994 | ||
ccfbe679 | 995 | `punctuation', `punct' |
12c64503 GM |
996 | matches punctuation. (But at present, for multibyte characters, |
997 | it matches anything that has non-word syntax.) | |
998 | ||
ccfbe679 | 999 | `space', `whitespace', `white' |
12c64503 GM |
1000 | matches anything that has whitespace syntax. |
1001 | ||
ccfbe679 | 1002 | `word', `wordchar' |
12c64503 GM |
1003 | matches anything that has word syntax. |
1004 | ||
ccfbe679 SM |
1005 | `not-wordchar' |
1006 | matches anything that has non-word syntax. | |
1007 | ||
12c64503 GM |
1008 | `(syntax SYNTAX)' |
1009 | matches a character with syntax SYNTAX. SYNTAX must be one | |
ccfbe679 SM |
1010 | of the following symbols, or a symbol corresponding to the syntax |
1011 | character, e.g. `\\.' for `\\s.'. | |
12c64503 GM |
1012 | |
1013 | `whitespace' (\\s- in string notation) | |
1014 | `punctuation' (\\s.) | |
1015 | `word' (\\sw) | |
1016 | `symbol' (\\s_) | |
1017 | `open-parenthesis' (\\s() | |
1018 | `close-parenthesis' (\\s)) | |
1019 | `expression-prefix' (\\s') | |
1020 | `string-quote' (\\s\") | |
1021 | `paired-delimiter' (\\s$) | |
1022 | `escape' (\\s\\) | |
1023 | `character-quote' (\\s/) | |
1024 | `comment-start' (\\s<) | |
1025 | `comment-end' (\\s>) | |
740b7c2d EZ |
1026 | `string-delimiter' (\\s|) |
1027 | `comment-delimiter' (\\s!) | |
12c64503 GM |
1028 | |
1029 | `(not (syntax SYNTAX))' | |
ccfbe679 | 1030 | matches a character that doesn't have syntax SYNTAX. |
12c64503 GM |
1031 | |
1032 | `(category CATEGORY)' | |
1033 | matches a character with category CATEGORY. CATEGORY must be | |
1034 | either a character to use for C, or one of the following symbols. | |
1035 | ||
1036 | `consonant' (\\c0 in string notation) | |
1037 | `base-vowel' (\\c1) | |
1038 | `upper-diacritical-mark' (\\c2) | |
1039 | `lower-diacritical-mark' (\\c3) | |
1040 | `tone-mark' (\\c4) | |
1041 | `symbol' (\\c5) | |
1042 | `digit' (\\c6) | |
1043 | `vowel-modifying-diacritical-mark' (\\c7) | |
1044 | `vowel-sign' (\\c8) | |
1045 | `semivowel-lower' (\\c9) | |
1046 | `not-at-end-of-line' (\\c<) | |
1047 | `not-at-beginning-of-line' (\\c>) | |
1048 | `alpha-numeric-two-byte' (\\cA) | |
73dbf960 | 1049 | `chinese-two-byte' (\\cC) |
12c64503 GM |
1050 | `greek-two-byte' (\\cG) |
1051 | `japanese-hiragana-two-byte' (\\cH) | |
1052 | `indian-tow-byte' (\\cI) | |
1053 | `japanese-katakana-two-byte' (\\cK) | |
1054 | `korean-hangul-two-byte' (\\cN) | |
1055 | `cyrillic-two-byte' (\\cY) | |
ccfbe679 | 1056 | `combining-diacritic' (\\c^) |
12c64503 GM |
1057 | `ascii' (\\ca) |
1058 | `arabic' (\\cb) | |
1059 | `chinese' (\\cc) | |
1060 | `ethiopic' (\\ce) | |
1061 | `greek' (\\cg) | |
1062 | `korean' (\\ch) | |
1063 | `indian' (\\ci) | |
1064 | `japanese' (\\cj) | |
1065 | `japanese-katakana' (\\ck) | |
1066 | `latin' (\\cl) | |
1067 | `lao' (\\co) | |
1068 | `tibetan' (\\cq) | |
1069 | `japanese-roman' (\\cr) | |
1070 | `thai' (\\ct) | |
1071 | `vietnamese' (\\cv) | |
1072 | `hebrew' (\\cw) | |
1073 | `cyrillic' (\\cy) | |
1074 | `can-break' (\\c|) | |
1075 | ||
1076 | `(not (category CATEGORY))' | |
ccfbe679 | 1077 | matches a character that doesn't have category CATEGORY. |
12c64503 GM |
1078 | |
1079 | `(and SEXP1 SEXP2 ...)' | |
ccfbe679 SM |
1080 | `(: SEXP1 SEXP2 ...)' |
1081 | `(seq SEXP1 SEXP2 ...)' | |
1082 | `(sequence SEXP1 SEXP2 ...)' | |
12c64503 GM |
1083 | matches what SEXP1 matches, followed by what SEXP2 matches, etc. |
1084 | ||
1085 | `(submatch SEXP1 SEXP2 ...)' | |
ccfbe679 | 1086 | `(group SEXP1 SEXP2 ...)' |
12c64503 GM |
1087 | like `and', but makes the match accessible with `match-end', |
1088 | `match-beginning', and `match-string'. | |
1089 | ||
6420d28b CY |
1090 | `(submatch-n N SEXP1 SEXP2 ...)' |
1091 | `(group-n N SEXP1 SEXP2 ...)' | |
1092 | like `group', but make it an explicitly-numbered group with | |
1093 | group number N. | |
1094 | ||
12c64503 | 1095 | `(or SEXP1 SEXP2 ...)' |
ccfbe679 | 1096 | `(| SEXP1 SEXP2 ...)' |
12c64503 GM |
1097 | matches anything that matches SEXP1 or SEXP2, etc. If all |
1098 | args are strings, use `regexp-opt' to optimize the resulting | |
1099 | regular expression. | |
1100 | ||
1101 | `(minimal-match SEXP)' | |
1102 | produce a non-greedy regexp for SEXP. Normally, regexps matching | |
740b7c2d | 1103 | zero or more occurrences of something are \"greedy\" in that they |
12c64503 GM |
1104 | match as much as they can, as long as the overall regexp can |
1105 | still match. A non-greedy regexp matches as little as possible. | |
1106 | ||
1107 | `(maximal-match SEXP)' | |
0a6cac62 | 1108 | produce a greedy regexp for SEXP. This is the default. |
12c64503 | 1109 | |
ccfbe679 SM |
1110 | Below, `SEXP ...' represents a sequence of regexp forms, treated as if |
1111 | enclosed in `(and ...)'. | |
12c64503 | 1112 | |
ccfbe679 SM |
1113 | `(zero-or-more SEXP ...)' |
1114 | `(0+ SEXP ...)' | |
1115 | matches zero or more occurrences of what SEXP ... matches. | |
12c64503 | 1116 | |
ccfbe679 SM |
1117 | `(* SEXP ...)' |
1118 | like `zero-or-more', but always produces a greedy regexp, independent | |
1119 | of `rx-greedy-flag'. | |
12c64503 | 1120 | |
ccfbe679 SM |
1121 | `(*? SEXP ...)' |
1122 | like `zero-or-more', but always produces a non-greedy regexp, | |
1123 | independent of `rx-greedy-flag'. | |
a1506d29 | 1124 | |
ccfbe679 SM |
1125 | `(one-or-more SEXP ...)' |
1126 | `(1+ SEXP ...)' | |
1127 | matches one or more occurrences of SEXP ... | |
12c64503 | 1128 | |
ccfbe679 | 1129 | `(+ SEXP ...)' |
12c64503 GM |
1130 | like `one-or-more', but always produces a greedy regexp. |
1131 | ||
ccfbe679 | 1132 | `(+? SEXP ...)' |
12c64503 GM |
1133 | like `one-or-more', but always produces a non-greedy regexp. |
1134 | ||
ccfbe679 SM |
1135 | `(zero-or-one SEXP ...)' |
1136 | `(optional SEXP ...)' | |
1137 | `(opt SEXP ...)' | |
12c64503 | 1138 | matches zero or one occurrences of A. |
a1506d29 | 1139 | |
ccfbe679 | 1140 | `(? SEXP ...)' |
12c64503 GM |
1141 | like `zero-or-one', but always produces a greedy regexp. |
1142 | ||
ccfbe679 | 1143 | `(?? SEXP ...)' |
12c64503 GM |
1144 | like `zero-or-one', but always produces a non-greedy regexp. |
1145 | ||
1146 | `(repeat N SEXP)' | |
ccfbe679 SM |
1147 | `(= N SEXP ...)' |
1148 | matches N occurrences. | |
1149 | ||
1150 | `(>= N SEXP ...)' | |
1151 | matches N or more occurrences. | |
12c64503 GM |
1152 | |
1153 | `(repeat N M SEXP)' | |
ccfbe679 SM |
1154 | `(** N M SEXP ...)' |
1155 | matches N to M occurrences. | |
1156 | ||
942269e7 JB |
1157 | `(backref N)' |
1158 | matches what was matched previously by submatch N. | |
1159 | ||
12c64503 | 1160 | `(eval FORM)' |
942269e7 JB |
1161 | evaluate FORM and insert result. If result is a string, |
1162 | `regexp-quote' it. | |
12c64503 GM |
1163 | |
1164 | `(regexp REGEXP)' | |
942269e7 | 1165 | include REGEXP in string notation in the result." |
ccfbe679 SM |
1166 | (cond ((null regexps) |
1167 | (error "No regexp")) | |
1168 | ((cdr regexps) | |
1169 | (rx-to-string `(and ,@regexps) t)) | |
1170 | (t | |
1171 | (rx-to-string (car regexps) t)))) | |
1172 | \f | |
1173 | ;; ;; sregex.el replacement | |
1174 | ||
1175 | ;; ;;;###autoload (provide 'sregex) | |
1176 | ;; ;;;###autoload (autoload 'sregex "rx") | |
1177 | ;; (defalias 'sregex 'rx-to-string) | |
1178 | ;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro) | |
1179 | ;; (defalias 'sregexq 'rx) | |
1180 | \f | |
12c64503 GM |
1181 | (provide 'rx) |
1182 | ||
1183 | ;;; rx.el ends here |