Commit | Line | Data |
---|---|---|
12c64503 GM |
1 | ;;; rx.el --- sexp notation for regular expressions |
2 | ||
6b61353c | 3 | ;; Copyright (C) 2001, 2003, 2004 Free Software Foundation, Inc. |
12c64503 GM |
4 | |
5 | ;; Author: Gerd Moellmann <gerd@gnu.org> | |
6 | ;; Maintainer: FSF | |
7 | ;; Keywords: strings, regexps, extensions | |
8 | ||
9 | ;; This file is part of GNU Emacs. | |
10 | ||
11 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
12 | ;; it under the terms of the GNU General Public License as published by | |
13 | ;; the Free Software Foundation; either version 2, or (at your option) | |
14 | ;; any later version. | |
15 | ||
16 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
17 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 | ;; GNU General Public License for more details. | |
20 | ||
21 | ;; You should have received a copy of the GNU General Public License | |
22 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
23 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
24 | ;; Boston, MA 02111-1307, USA. | |
25 | ||
26 | ;;; Commentary: | |
27 | ||
28 | ;; This is another implementation of sexp-form regular expressions. | |
29 | ;; It was unfortunately written without being aware of the Sregex | |
30 | ;; package coming with Emacs, but as things stand, Rx completely | |
31 | ;; covers all regexp features, which Sregex doesn't, doesn't suffer | |
32 | ;; from the bugs mentioned in the commentary section of Sregex, and | |
33 | ;; uses a nicer syntax (IMHO, of course :-). | |
34 | ||
35 | ;; Rx translates a sexp notation for regular expressions into the | |
36 | ;; usual string notation. The translation can be done at compile-time | |
37 | ;; by using the `rx' macro. It can be done at run-time by calling | |
38 | ;; function `rx-to-string'. See the documentation of `rx' for a | |
39 | ;; complete description of the sexp notation. | |
40 | ;; | |
41 | ;; Some examples of string regexps and their sexp counterparts: | |
42 | ;; | |
43 | ;; "^[a-z]*" | |
44 | ;; (rx (and line-start (0+ (in "a-z")))) | |
45 | ;; | |
46 | ;; "\n[^ \t]" | |
47 | ;; (rx (and "\n" (not blank))), or | |
48 | ;; (rx (and "\n" (not (any " \t")))) | |
49 | ;; | |
50 | ;; "\\*\\*\\* EOOH \\*\\*\\*\n" | |
51 | ;; (rx "*** EOOH ***\n") | |
52 | ;; | |
53 | ;; "\\<\\(catch\\|finally\\)\\>[^_]" | |
54 | ;; (rx (and word-start (submatch (or "catch" "finally")) word-end | |
55 | ;; (not (any ?_)))) | |
56 | ;; | |
57 | ;; "[ \t\n]*:\\([^:]+\\|$\\)" | |
58 | ;; (rx (and (zero-or-more (in " \t\n")) ":" | |
59 | ;; (submatch (or line-end (one-or-more (not (any ?:))))))) | |
60 | ;; | |
61 | ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*" | |
62 | ;; (rx (and line-start | |
63 | ;; "content-transfer-encoding:" | |
c53f9b3b | 64 | ;; (* (submatch (? ?\n) blank)) |
12c64503 | 65 | ;; "quoted-printable" |
c53f9b3b | 66 | ;; (* (submatch (? ?\n) blank)))) |
12c64503 GM |
67 | ;; |
68 | ;; (concat "^\\(?:" something-else "\\)") | |
69 | ;; (rx (and line-start (eval something-else))), statically or | |
70 | ;; (rx-to-string `(and line-start ,something-else)), dynamically. | |
71 | ;; | |
72 | ;; (regexp-opt '(STRING1 STRING2 ...)) | |
73 | ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically | |
74 | ;; calls `regexp-opt' as needed. | |
75 | ;; | |
76 | ;; "^;;\\s-*\n\\|^\n" | |
77 | ;; (rx (or (and line-start ";;" (0+ space) ?\n) | |
78 | ;; (and line-start ?\n))) | |
79 | ;; | |
80 | ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) " | |
a1506d29 JB |
81 | ;; (rx (and "$Id: " |
82 | ;; (1+ (not (in " "))) | |
12c64503 GM |
83 | ;; " " |
84 | ;; (submatch (1+ (not (in " ")))) | |
c53f9b3b | 85 | ;; " ")) |
12c64503 GM |
86 | ;; |
87 | ;; "\\\\\\\\\\[\\w+" | |
88 | ;; (rx (and ?\\ ?\\ ?\[ (1+ word))) | |
89 | ;; | |
90 | ;; etc. | |
91 | ||
92 | ;;; History: | |
a1506d29 | 93 | ;; |
12c64503 GM |
94 | |
95 | ;;; Code: | |
96 | ||
97 | ||
(defconst rx-constituents
  '((and . (rx-and 1 nil))
    (or . (rx-or 1 nil))
    (not-newline . ".")
    ;; Kept inside a shy group: a bare alternation would swallow
    ;; whatever is concatenated before or after it.
    (anything . "\\(?:.\\|\n\\)")
    (any . (rx-any 1 1 rx-check-any))
    (in . any)
    (not . (rx-not 1 1 rx-check-not))
    (repeat . (rx-repeat 2 3))
    (submatch . (rx-submatch 1 nil))
    (group . submatch)
    (zero-or-more . (rx-kleene 1 1))
    (one-or-more . (rx-kleene 1 1))
    (zero-or-one . (rx-kleene 1 1))
    (\? . zero-or-one)
    (\?? . zero-or-one)
    (* . zero-or-more)
    (*? . zero-or-more)
    (0+ . zero-or-more)
    (+ . one-or-more)
    (+? . one-or-more)
    (1+ . one-or-more)
    (optional . zero-or-one)
    (minimal-match . (rx-greedy 1 1))
    (maximal-match . (rx-greedy 1 1))
    (backref . (rx-backref 1 1 rx-check-backref))
    (line-start . "^")
    (line-end . "$")
    (string-start . "\\`")
    (string-end . "\\'")
    (buffer-start . "\\`")
    (buffer-end . "\\'")
    (point . "\\=")
    (word-start . "\\<")
    (word-end . "\\>")
    (word-boundary . "\\b")
    (syntax . (rx-syntax 1 1))
    (category . (rx-category 1 1 rx-check-category))
    (eval . (rx-eval 1 1))
    (regexp . (rx-regexp 1 1 stringp))
    (digit . "[[:digit:]]")
    (control . "[[:cntrl:]]")
    (hex-digit . "[[:xdigit:]]")
    (blank . "[[:blank:]]")
    (graphic . "[[:graph:]]")
    (printing . "[[:print:]]")
    (alphanumeric . "[[:alnum:]]")
    (letter . "[[:alpha:]]")
    (ascii . "[[:ascii:]]")
    (nonascii . "[[:nonascii:]]")
    (lower . "[[:lower:]]")
    (punctuation . "[[:punct:]]")
    (space . "[[:space:]]")
    (upper . "[[:upper:]]")
    (word . "[[:word:]]"))
  "Alist of sexp form regexp constituents.
Each element of the alist has the form (SYMBOL . DEFN).
SYMBOL is a valid constituent of sexp regular expressions.
If DEFN is a string, SYMBOL is translated into DEFN.
If DEFN is a symbol, use the definition of DEFN, recursively.
Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
are the minimum and maximum number of arguments the function-form
sexp constituent SYMBOL may have in sexp regular expressions.
MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
all arguments must satisfy PREDICATE.")
164 | ||
165 | ||
;; Lookup table consulted by the function `rx-syntax' to translate the
;; symbolic name in `(syntax NAME)' into the character that follows
;; \s in the generated regexp.
(defconst rx-syntax
  '((whitespace . ?-)
    (punctuation . ?.)
    (word . ?w)
    (symbol . ?_)
    (open-parenthesis . ?\()
    (close-parenthesis . ?\))
    (expression-prefix . ?\')
    (string-quote . ?\")
    (paired-delimiter . ?$)
    (escape . ?\\)
    (character-quote . ?/)
    (comment-start . ?<)
    (comment-end . ?>)
    (string-delimiter . ?|)
    (comment-delimiter . ?!))
  "Alist mapping Rx syntax symbols to syntax characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(syntax SYMBOL)', and CHAR is the syntax character
corresponding to SYMBOL, as it would be used with \\s or \\S in
regular expressions.")
187 | ||
188 | ||
(defconst rx-categories
  '((consonant . ?0)
    (base-vowel . ?1)
    (upper-diacritical-mark . ?2)
    (lower-diacritical-mark . ?3)
    (tone-mark . ?4)
    (symbol . ?5)
    (digit . ?6)
    (vowel-modifying-diacritical-mark . ?7)
    (vowel-sign . ?8)
    (semivowel-lower . ?9)
    (not-at-end-of-line . ?<)
    (not-at-beginning-of-line . ?>)
    (alpha-numeric-two-byte . ?A)
    (chinese-two-byte . ?C)
    ;; Historical misspelling; kept so existing callers keep working.
    (chinse-two-byte . ?C)
    (greek-two-byte . ?G)
    (japanese-hiragana-two-byte . ?H)
    (indian-two-byte . ?I)
    (japanese-katakana-two-byte . ?K)
    (korean-hangul-two-byte . ?N)
    (cyrillic-two-byte . ?Y)
    (combining-diacritic . ?^)
    (ascii . ?a)
    (arabic . ?b)
    (chinese . ?c)
    (ethiopic . ?e)
    (greek . ?g)
    (korean . ?h)
    (indian . ?i)
    (japanese . ?j)
    (japanese-katakana . ?k)
    (latin . ?l)
    (lao . ?o)
    (tibetan . ?q)
    (japanese-roman . ?r)
    (thai . ?t)
    (vietnamese . ?v)
    (hebrew . ?w)
    (cyrillic . ?y)
    (can-break . ?|))
  "Alist mapping symbols to category characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(category SYMBOL)', and CHAR is the category character
corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
regular expression strings.
`chinse-two-byte' is a misspelled alias of `chinese-two-byte' that
is retained for backward compatibility.")
234 | ||
235 | ||
(defvar rx-greedy-flag t
  "Non-nil means produce greedy regular expressions for `zero-or-one',
`zero-or-more', and `one-or-more'.  Dynamically bound.
`rx-greedy' rebinds this around the translation of its argument, and
`rx-kleene' consults it for operators that do not fix greediness
themselves.")
239 | ||
240 | ||
(defun rx-info (op)
  "Return parsing/code generation info for OP.
The space character (ASCII 32) is looked up as the symbol `?', and
the character `?' as the symbol `??'; these are what the Lisp reader
produces for the operators written `(? ...)' and `(?? ...)'.
Symbol definitions in `rx-constituents' are followed until a
non-symbol definition, or nil for an unknown OP, is reached."
  (let ((info (cond ((eq op ? ) '\?)
                    ((eq op ??) '\??)
                    (t op))))
    ;; Chase alias chains such as `in' -> `any' -> (rx-any ...).
    (while (and info (symbolp info))
      (setq info (cdr (assq info rx-constituents))))
    info))
a1506d29 | 251 | |
12c64503 GM |
252 | |
(defun rx-check (form)
  "Check FORM according to its car's parsing info.
Signal an error if FORM has fewer arguments than the minimum or more
than the maximum recorded for its operator, or if the operator has an
argument predicate that some argument fails to satisfy."
  (let* ((head (car form))
         (info (rx-info head))
         (arg-count (1- (length form)))
         (lower (nth 1 info))
         (upper (nth 2 info))
         (pred (nth 3 info)))
    (if (and lower (< arg-count lower))
        (error "rx form `%s' requires at least %d args"
               head lower))
    (if (and upper (> arg-count upper))
        (error "rx form `%s' accepts at most %d args"
               head upper))
    (if pred
        (let ((args (cdr form)))
          (while args
            (or (funcall pred (car args))
                (error "rx form `%s' requires args satisfying `%s'"
                       head pred))
            (setq args (cdr args)))))))
273 | ||
274 | ||
(defun rx-and (form)
  "Parse and produce code from FORM.
FORM is of the form `(and FORM1 ...)'.
The translations of the sub-forms are concatenated and the whole
sequence is enclosed in a shy group."
  (rx-check form)
  (let ((parts (mapcar (lambda (sub) (rx-to-string sub 'no-group))
                       (cdr form))))
    (concat "\\(?:" (apply #'concat parts) "\\)")))
12c64503 GM |
284 | |
285 | ||
(defun rx-or (form)
  "Parse and produce code from FORM, which is `(or FORM1 ...)'.
When every alternative is a string, the alternation is built by
`regexp-opt'; otherwise the alternatives are translated one by one
and joined with the alternation operator.  The result is always
enclosed in a shy group."
  (rx-check form)
  (let* ((alternatives (cdr form))
         (only-strings (not (memq nil (mapcar #'stringp alternatives))))
         (body (if only-strings
                   (regexp-opt alternatives)
                 (mapconcat #'rx-to-string alternatives "\\|"))))
    (concat "\\(?:" body "\\)")))
12c64503 GM |
298 | |
299 | ||
(defun rx-quote-for-set (string)
  "Transform STRING for use in a character set.
If STRING contains a `]' after its first character, move it to the
front, where a bracket expression reads it as a literal member.
If the resulting string starts with a `^', move that to the end, so
that it is not read as the negation marker.
Return the transformed string."
  ;; Both captures must span whole substrings.  With the repetition
  ;; placed outside the second group, only the final character after
  ;; the `]' would be retained and the rest silently dropped.
  (when (string-match "\\`\\(\\(?:.\\|\n\\)+\\)\\]\\(\\(?:.\\|\n\\)*\\)\\'"
                      string)
    (setq string (concat "]" (match-string 1 string)
                         (match-string 2 string))))
  (when (string-match "\\`^\\(\\(?:.\\|\n\\)+\\)\\'" string)
    (setq string (concat (substring string 1) "^")))
  string)
311 | ||
312 | ||
(defun rx-check-any (arg)
  "Check arg ARG for Rx `any'.
Return t if ARG is an integer or a non-empty string; signal an error
for an empty string or for any other kind of object."
  (if (integerp arg)
      t
    (if (not (stringp arg))
        (error "rx `any' requires string or character arg")
      (if (string= arg "")
          (error "String arg for rx `any' must not be empty")
        t))))
12c64503 GM |
321 | |
322 | ||
(defun rx-any (form)
  "Parse and produce code from FORM, which is `(any SET)'.
SET is a character or a non-empty string.  A character, or a string
of length one, is returned as that single character; a longer string
is returned as a bracket expression built with `rx-quote-for-set'."
  (rx-check form)
  (let ((set (cadr form)))
    ;; NOTE(review): the single-character results are not passed
    ;; through `regexp-quote', so a character such as `.' or `*' is
    ;; emitted as-is -- confirm whether callers rely on that.
    (if (integerp set)
        (char-to-string set)
      (if (= (length set) 1)
          set
        (concat "[" (rx-quote-for-set set) "]")))))
335 | ||
336 | ||
6b61353c KH |
(defun rx-check-not (arg)
  "Check arg ARG for Rx `not'.
ARG is acceptable if it is one of the character-class symbols, the
symbol `word-boundary', or a list whose first element is `not',
`any', `in', `syntax' or `category'.
Return t if ARG is acceptable; signal an error otherwise."
  ;; Test ARG itself.  Referring to a variable that is not this
  ;; function's parameter would pick up whatever binding the caller
  ;; happens to have, or none at all.
  (unless (or (memq arg
                    '(digit control hex-digit blank graphic printing
                      alphanumeric letter ascii nonascii lower
                      punctuation space upper word
                      word-boundary))
              (and (consp arg)
                   (memq (car arg) '(not any in syntax category))))
    (error "rx `not' syntax error: %s" arg))
  t)
347 | ||
348 | ||
(defun rx-not (form)
  "Parse and produce code from FORM.  FORM is `(not ...)'.
The argument is translated first, and the resulting regexp string is
then rewritten into its complement by inspecting how it begins."
  (rx-check form)
  ;; `case-fold-search' is bound to nil so that the matches below tell
  ;; the lower-case operators (\s, \c, \b) from the upper-case ones.
  (let ((result (rx-to-string (cadr form) 'no-group))
        case-fold-search)
    (cond ((string-match "\\`\\[^" result)
           ;; Already a negated set: drop the negation.  A four
           ;; character result is reduced to its single member.
           (if (= (length result) 4)
               (substring result 2 3)
             (concat "[" (substring result 2))))
          ((string-match "\\`\\[" result)
           ;; A plain set: insert the negation marker.
           (concat "[^" (substring result 1)))
          ;; Syntax, category and word-boundary operators: swap the
          ;; case of the operator letter.
          ((string-match "\\`\\\\s." result)
           (concat "\\S" (substring result 2)))
          ((string-match "\\`\\\\S." result)
           (concat "\\s" (substring result 2)))
          ((string-match "\\`\\\\c." result)
           (concat "\\C" (substring result 2)))
          ((string-match "\\`\\\\C." result)
           (concat "\\c" (substring result 2)))
          ((string-match "\\`\\\\B" result)
           (concat "\\b" (substring result 2)))
          ((string-match "\\`\\\\b" result)
           (concat "\\B" (substring result 2)))
          (t
           ;; Anything else is wrapped in a negated set.
           (concat "[^" result "]")))))
374 | ||
375 | ||
(defun rx-repeat-operand (sexp)
  "Translate SEXP for use as the operand of a repetition count.
The translation is wrapped in a shy group unless `rx-atomic-p' says
that a suffix operator would already apply to all of it."
  (let ((regexp (rx-to-string sexp)))
    (if (rx-atomic-p regexp)
        regexp
      (concat "\\(?:" regexp "\\)"))))

(defun rx-repeat (form)
  "Parse and produce code from FORM.
FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'.
In the first case N must be a positive integer.  In the second case
N and M must be non-negative integers with N not greater than M.
Signal an error if these conditions do not hold."
  (rx-check form)
  ;; The operand goes through `rx-repeat-operand' so that the count
  ;; applies to all of it; a multi-character string would otherwise
  ;; have only its last character repeated.
  (cond ((= (length form) 3)
         (unless (and (integerp (nth 1 form))
                      (> (nth 1 form) 0))
           (error "rx `repeat' requires positive integer first arg"))
         (format "%s\\{%d\\}" (rx-repeat-operand (nth 2 form)) (nth 1 form)))
        ((or (not (integerp (nth 2 form)))
             (< (nth 2 form) 0)
             (not (integerp (nth 1 form)))
             (< (nth 1 form) 0)
             (< (nth 2 form) (nth 1 form)))
         (error "rx `repeat' range error"))
        (t
         (format "%s\\{%d,%d\\}" (rx-repeat-operand (nth 3 form))
                 (nth 1 form) (nth 2 form)))))
394 | ||
395 | ||
(defun rx-submatch (form)
  "Parse and produce code from FORM, which is `(submatch ...)'.
The translations of the sub-forms are concatenated and enclosed in a
capturing group."
  (let ((inner ""))
    (dolist (sub (cdr form))
      (setq inner (concat inner (rx-to-string sub 'no-group))))
    (concat "\\(" inner "\\)")))
12c64503 | 402 | |
6b61353c KH |
(defun rx-backref (form)
  "Parse and produce code from FORM, which is `(backref N)'.
Return a backslash followed by the decimal digits of N."
  (rx-check form)
  (concat "\\" (number-to-string (nth 1 form))))
407 | ||
(defun rx-check-backref (arg)
  "Check arg ARG for Rx `backref'.
Return t if ARG is an integer from 1 through 9; signal an error
otherwise."
  (if (and (integerp arg) (< 0 arg) (< arg 10))
      t
    (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
412 | ||
12c64503 GM |
(defun rx-kleene (form)
  "Parse and produce code from FORM.
FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
`zero-or-more' etc. operators.
If OP is one of `*', `+', `?', produce a greedy regexp.
If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
is non-nil.
The translation of FORM1 is wrapped in a shy group unless
`rx-atomic-p' reports that the suffix would apply to all of it."
  (rx-check form)
  ;; The reader turns `?' followed by a space into the space character
  ;; and `??' into the character `?', so these two operators can arrive
  ;; either as that character or, when written with a backslash, as
  ;; the symbol.  Both spellings are accepted here.
  (let ((suffix (cond ((memq (car form) '(* + \? ? )) "")
                      ((memq (car form) '(*? +? \?? ??)) "?")
                      (rx-greedy-flag "")
                      (t "?")))
        (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
                  ((memq (car form) '(+ +? 1+ one-or-more)) "+")
                  (t "?")))
        (result (rx-to-string (cadr form) 'no-group)))
    (if (not (rx-atomic-p result))
        (setq result (concat "\\(?:" result "\\)")))
    (concat result op suffix)))
433 | ||
(defun rx-atomic-p (r)
  "Return non-nil if regexp string R is atomic.
An atomic regexp R is one such that a suffix operator appended to R
will apply to all of R.  For example, \"a\", \"[abc]\" and
\"\\(ab\\|ab*c\\)\" are atomic, while \"ab\", \"[ab]c\" and
\"ab\\|ab*c\" are not.

Only the length of R and its first and last characters are examined.
R is reported as atomic when it is

  a single character:   a
  a group or shy group: at least six characters long, starting with
                        an open-group and ending with a close-group
  a character class:    starting with `[' and ending with `]'

Regexps that are atomic but fit none of these shapes, for instance
ones ending in an operator such as \"a+\", are reported as
non-atomic."
  (let ((len (length r)))
    (cond ((= len 1) t)
          ((and (>= len 6)
                (string= (substring r 0 2) "\\(")
                (string= (substring r (- len 2)) "\\)"))
           t)
          ((>= len 2)
           (and (eq (aref r 0) ?\[)
                (eq (aref r (1- len)) ?\])))
          (t nil))))
12c64503 GM |
466 | |
467 | ||
(defun rx-syntax (form)
  "Parse and produce code from FORM, which is `(syntax SYMBOL)'.
SYMBOL is looked up in the alist `rx-syntax'; signal an error if it
is not found there."
  (rx-check form)
  (let* ((name (cadr form))
         (entry (assq name rx-syntax)))
    (if entry
        (concat "\\s" (char-to-string (cdr entry)))
      (error "Unknown rx syntax `%s'" name))))
475 | ||
476 | ||
(defun rx-check-category (form)
  "Check the argument FORM of a `(category FORM)'.
Return t if FORM is an integer or a symbol listed in `rx-categories';
signal an error otherwise."
  (if (or (integerp form)
          (cdr (assq form rx-categories)))
      t
    (error "Unknown category `%s'" form)))
a1506d29 | 483 | |
12c64503 GM |
484 | |
(defun rx-category (form)
  "Parse and produce code from FORM, which is `(category SYMBOL ...)'.
An integer argument is used directly as the category character; a
symbol is translated through `rx-categories'."
  (rx-check form)
  (let ((cat (cadr form)))
    (unless (integerp cat)
      (setq cat (cdr (assq cat rx-categories))))
    (format "\\c%c" cat)))
492 | ||
493 | ||
(defun rx-eval (form)
  "Parse and produce code from FORM, which is `(eval FORM)'.
The argument is evaluated with `eval' and the value is then
translated by `rx-to-string'."
  (rx-check form)
  (let ((value (eval (cadr form))))
    (rx-to-string value)))
498 | ||
499 | ||
(defun rx-greedy (form)
  "Parse and produce code from FORM.
If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
`+', and `?' operators will be used in FORM1.  If FORM is
`(maximal-match FORM1)', greedy operators will be used.
This works by binding `rx-greedy-flag' while FORM1 is translated."
  (rx-check form)
  (let ((rx-greedy-flag (if (eq (car form) 'maximal-match) t nil)))
    (rx-to-string (cadr form))))
508 | ||
509 | ||
(defun rx-regexp (form)
  "Parse and produce code from FORM, which is `(regexp STRING)'.
STRING is inserted unchanged inside a shy group."
  (rx-check form)
  (format "\\(?:%s\\)" (cadr form)))
514 | ||
515 | ||
516 | ;;;###autoload | |
(defun rx-to-string (form &optional no-group)
  "Parse and produce code for regular expression FORM.
FORM is a regular expression in sexp form.
NO-GROUP non-nil means don't put shy groups around the result.

Strings and characters are quoted with `regexp-quote'.  Symbols and
lists are translated according to `rx-constituents'.  The result of
translating a list is wrapped in a shy group unless NO-GROUP is
non-nil or the result already starts with an open-group."
  (cond ((stringp form)
         (regexp-quote form))
        ((integerp form)
         (regexp-quote (char-to-string form)))
        ((not (or (symbolp form) (consp form)))
         (error "rx syntax error at `%s'" form))
        ((symbolp form)
         (let ((defn (rx-info form)))
           (if (stringp defn)
               defn
             (if defn
                 (funcall (car defn) form)
               (error "Unknown rx form `%s'" form)))))
        (t
         (let* ((op (car form))
                (defn (rx-info op)))
           (or (consp defn)
               (error "Unknown rx form `%s'" op))
           (let ((regexp (funcall (car defn) form)))
             (if (and (not no-group)
                      (not (string-match "\\`\\\\[(]" regexp)))
                 (concat "\\(?:" regexp "\\)")
               regexp))))))
12c64503 GM |
543 | |
544 | ||
545 | ;;;###autoload | |
(defmacro rx (regexp)
  "Translate a regular expression REGEXP in sexp form to a regexp string.
See also `rx-to-string' for how to do such a translation at run-time.

The following are valid subforms of regular expressions in sexp
notation.

STRING
     matches string STRING literally.

CHAR
     matches character CHAR literally.

`not-newline'
     matches any character except a newline.

`anything'
     matches any character.

`(any SET)'
     matches any character in SET.  SET may be a character or string.
     Ranges of characters can be specified as `A-Z' in strings.

`(in SET)'
     like `any'.

`(not (any SET))'
     matches any character not in SET.

`line-start'
     matches the empty string, but only at the beginning of a line
     in the text being matched.

`line-end'
     is similar to `line-start' but matches only at the end of a line.

`string-start'
     matches the empty string, but only at the beginning of the
     string being matched against.

`string-end'
     matches the empty string, but only at the end of the
     string being matched against.

`buffer-start'
     matches the empty string, but only at the beginning of the
     buffer being matched against.

`buffer-end'
     matches the empty string, but only at the end of the
     buffer being matched against.

`point'
     matches the empty string, but only at point.

`word-start'
     matches the empty string, but only at the beginning of a word.

`word-end'
     matches the empty string, but only at the end of a word.

`word-boundary'
     matches the empty string, but only at the beginning or end of a
     word.

`(not word-boundary)'
     matches the empty string, but not at the beginning or end of a
     word.

`digit'
     matches 0 through 9.

`control'
     matches ASCII control characters.

`hex-digit'
     matches 0 through 9, a through f and A through F.

`blank'
     matches space and tab only.

`graphic'
     matches graphic characters--everything except ASCII control chars,
     space, and DEL.

`printing'
     matches printing characters--everything except ASCII control chars
     and DEL.

`alphanumeric'
     matches letters and digits.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`letter'
     matches letters.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`ascii'
     matches ASCII (unibyte) characters.

`nonascii'
     matches non-ASCII (multibyte) characters.

`lower'
     matches anything lower-case.

`upper'
     matches anything upper-case.

`punctuation'
     matches punctuation.  (But at present, for multibyte characters,
     it matches anything that has non-word syntax.)

`space'
     matches anything that has whitespace syntax.

`word'
     matches anything that has word syntax.

`(syntax SYNTAX)'
     matches a character with syntax SYNTAX.  SYNTAX must be one
     of the following symbols.

     `whitespace'		(\\s- in string notation)
     `punctuation'		(\\s.)
     `word'			(\\sw)
     `symbol'			(\\s_)
     `open-parenthesis'		(\\s()
     `close-parenthesis'	(\\s))
     `expression-prefix'	(\\s')
     `string-quote'		(\\s\")
     `paired-delimiter'		(\\s$)
     `escape'			(\\s\\)
     `character-quote'		(\\s/)
     `comment-start'		(\\s<)
     `comment-end'		(\\s>)
     `string-delimiter'		(\\s|)
     `comment-delimiter'	(\\s!)

`(not (syntax SYNTAX))'
     matches a character that does not have syntax SYNTAX.

`(category CATEGORY)'
     matches a character with category CATEGORY.  CATEGORY must be
     either a category character, or one of the following symbols.

     `consonant'			(\\c0 in string notation)
     `base-vowel'			(\\c1)
     `upper-diacritical-mark'		(\\c2)
     `lower-diacritical-mark'		(\\c3)
     `tone-mark'			(\\c4)
     `symbol'				(\\c5)
     `digit'				(\\c6)
     `vowel-modifying-diacritical-mark'	(\\c7)
     `vowel-sign'			(\\c8)
     `semivowel-lower'			(\\c9)
     `not-at-end-of-line'		(\\c<)
     `not-at-beginning-of-line'		(\\c>)
     `alpha-numeric-two-byte'		(\\cA)
     `chinse-two-byte'			(\\cC)
     `greek-two-byte'			(\\cG)
     `japanese-hiragana-two-byte'	(\\cH)
     `indian-two-byte'			(\\cI)
     `japanese-katakana-two-byte'	(\\cK)
     `korean-hangul-two-byte'		(\\cN)
     `cyrillic-two-byte'		(\\cY)
     `combining-diacritic'		(\\c^)
     `ascii'				(\\ca)
     `arabic'				(\\cb)
     `chinese'				(\\cc)
     `ethiopic'				(\\ce)
     `greek'				(\\cg)
     `korean'				(\\ch)
     `indian'				(\\ci)
     `japanese'				(\\cj)
     `japanese-katakana'		(\\ck)
     `latin'				(\\cl)
     `lao'				(\\co)
     `tibetan'				(\\cq)
     `japanese-roman'			(\\cr)
     `thai'				(\\ct)
     `vietnamese'			(\\cv)
     `hebrew'				(\\cw)
     `cyrillic'				(\\cy)
     `can-break'			(\\c|)

`(not (category CATEGORY))'
     matches a character that does not have category CATEGORY.

`(and SEXP1 SEXP2 ...)'
     matches what SEXP1 matches, followed by what SEXP2 matches, etc.

`(submatch SEXP1 SEXP2 ...)'
     like `and', but makes the match accessible with `match-end',
     `match-beginning', and `match-string'.

`(group SEXP1 SEXP2 ...)'
     another name for `submatch'.

`(or SEXP1 SEXP2 ...)'
     matches anything that matches SEXP1 or SEXP2, etc.  If all
     args are strings, use `regexp-opt' to optimize the resulting
     regular expression.

`(minimal-match SEXP)'
     produce a non-greedy regexp for SEXP.  Normally, regexps matching
     zero or more occurrences of something are \"greedy\" in that they
     match as much as they can, as long as the overall regexp can
     still match.  A non-greedy regexp matches as little as possible.

`(maximal-match SEXP)'
     produce a greedy regexp for SEXP.  This is the default.

`(zero-or-more SEXP)'
     matches zero or more occurrences of what SEXP matches.

`(0+ SEXP)'
     like `zero-or-more'.

`(* SEXP)'
     like `zero-or-more', but always produces a greedy regexp.

`(*? SEXP)'
     like `zero-or-more', but always produces a non-greedy regexp.

`(one-or-more SEXP)'
     matches one or more occurrences of what SEXP matches.

`(1+ SEXP)'
     like `one-or-more'.

`(+ SEXP)'
     like `one-or-more', but always produces a greedy regexp.

`(+? SEXP)'
     like `one-or-more', but always produces a non-greedy regexp.

`(zero-or-one SEXP)'
     matches zero or one occurrences of what SEXP matches.

`(optional SEXP)'
     like `zero-or-one'.

`(? SEXP)'
     like `zero-or-one', but always produces a greedy regexp.

`(?? SEXP)'
     like `zero-or-one', but always produces a non-greedy regexp.

`(repeat N SEXP)'
     matches N occurrences of what SEXP matches.

`(repeat N M SEXP)'
     matches N to M occurrences of what SEXP matches.

`(backref N)'
     matches what was matched previously by submatch N.

`(eval FORM)'
     evaluate FORM and insert result.  If result is a string,
     `regexp-quote' it.

`(regexp REGEXP)'
     include REGEXP in string notation in the result."

  ;; The translation runs when the macro is expanded, so the expansion
  ;; is the finished regexp string.
  (rx-to-string regexp))
12c64503 GM |
816 | |
817 | (provide 'rx) | |
818 | ||
6b61353c | 819 | ;;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b |
12c64503 | 820 | ;;; rx.el ends here |